diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/added_tokens.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/config.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fab80a37259025eaceb9d369ee65326716e25ca8 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/config.json @@ -0,0 +1,198 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": true, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 8, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/generation_config.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00001-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97496ab5343bb79002b986b43b41ae78796d9343 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9e25d2c7dd35fb520858ee44457e57989c606e7f6027f6a7a12cddca831477 +size 4972489328 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00002-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c050f4ccbaeff7c27e4cbe12e503516f2dc8830 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167b5c426a7bb824f1672c9bf19964f425a5db29b55542c7b60478e2c7c9fd20 +size 4985976068 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00003-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02e8b1a8f0e802587b1133dbb612a729235be1ce --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d7d575e3bf697f3a39df4f25eeea475ee68b8bf5afbcbed481f5b03d45bb7c +size 248943552 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model.safetensors.index.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7cdc5da041253f30bfca8dad5f6a64a31333d1b4 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model.safetensors.index.json @@ -0,0 +1,1033 @@ +{ + "metadata": { + "total_size": 10207261884 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/special_tokens_map.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer.model b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer_config.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/trainer_state.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..06e29505de9f8a4afcf7d161aaab1d4400fdac9a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/trainer_state.json @@ -0,0 +1,249523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.01808289, + "auxiliary_loss_mlp": 0.01789735, + "balance_loss_clip": 1.51843524, + "balance_loss_mlp": 1.50449085, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 55.615805515240744, + "language_loss": 2.85281086, + "learning_rate": 0.0, + "loss": 1.92297995, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 16.013601303100586 + }, + { + "auxiliary_loss_clip": 0.01205471, + "auxiliary_loss_mlp": 0.01193367, + "balance_loss_clip": 1.01232576, + "balance_loss_mlp": 1.00329947, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 37.19574783737679, + "language_loss": 1.82724988, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.85123825, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.423368215560913 + }, + { + "auxiliary_loss_clip": 0.01205604, + "auxiliary_loss_mlp": 0.01193181, + "balance_loss_clip": 1.01247215, + "balance_loss_mlp": 1.00320935, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 32.788821571149235, + "language_loss": 1.57343531, + "learning_rate": 7.073439208833112e-07, + "loss": 1.59742308, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.4155819416046143 + }, + { + "auxiliary_loss_clip": 0.01205417, + "auxiliary_loss_mlp": 0.011929, + "balance_loss_clip": 1.01223958, + "balance_loss_mlp": 1.00273693, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.18709942773174, + "language_loss": 1.67265058, + "learning_rate": 8.925686513863519e-07, + "loss": 1.6966337, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01205581, + "auxiliary_loss_mlp": 0.01193746, + "balance_loss_clip": 1.01239157, + "balance_loss_mlp": 1.00367892, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.158067266567876, + "language_loss": 1.91654134, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.94053459, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.6666457653045654 + }, + { + "auxiliary_loss_clip": 0.01205301, + "auxiliary_loss_mlp": 0.01193867, + "balance_loss_clip": 1.01211655, + "balance_loss_mlp": 1.00379932, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.63149389163462, + "language_loss": 1.6067028, + "learning_rate": 1.153628246576487e-06, + "loss": 1.63069463, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.7003629207611084 + }, + { + "auxiliary_loss_clip": 0.01205379, + "auxiliary_loss_mlp": 0.01193691, + "balance_loss_clip": 1.01218581, + "balance_loss_mlp": 1.00362325, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.52301815967496, + "language_loss": 1.53242016, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.55641079, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.7543461322784424 + }, + { + "auxiliary_loss_clip": 0.0120517, + "auxiliary_loss_mlp": 0.01192918, + "balance_loss_clip": 1.0119983, + "balance_loss_mlp": 1.00294614, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.584771546074798, + "language_loss": 1.43791139, + "learning_rate": 1.338852977079528e-06, + "loss": 1.46189237, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 2.811591148376465 + }, + { + "auxiliary_loss_clip": 0.01205371, + "auxiliary_loss_mlp": 0.01193719, + "balance_loss_clip": 1.01208091, + "balance_loss_mlp": 1.00374746, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 27.812397477622877, + "language_loss": 1.49796522, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.52195621, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.7613375186920166 + }, + { + "auxiliary_loss_clip": 0.01205338, + "auxiliary_loss_mlp": 0.01193683, + "balance_loss_clip": 1.01217806, + "balance_loss_mlp": 1.00380635, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 24.847648363727057, + "language_loss": 1.44712722, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.4711175, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.723255157470703 + }, + { + "auxiliary_loss_clip": 0.01205532, + "auxiliary_loss_mlp": 0.01193946, + "balance_loss_clip": 1.01234365, + "balance_loss_mlp": 1.00397456, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 24.496889825524256, + "language_loss": 1.44932544, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.47332025, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.675690174102783 + }, + { + "auxiliary_loss_clip": 0.01205207, + "auxiliary_loss_mlp": 0.01193262, + "balance_loss_clip": 1.0119946, + "balance_loss_mlp": 1.00319493, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.55593173723443, + "language_loss": 1.45137358, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.47535825, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.6753618717193604 + }, + { + "auxiliary_loss_clip": 0.01205362, + "auxiliary_loss_mlp": 0.0119271, + "balance_loss_clip": 1.01218629, + "balance_loss_mlp": 1.002738, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.562026180327297, + "language_loss": 1.24616981, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.27015066, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 2.786144971847534 + }, + { + "auxiliary_loss_clip": 0.01205496, + "auxiliary_loss_mlp": 0.01193802, + "balance_loss_clip": 1.01224446, + "balance_loss_mlp": 1.00382996, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 6.073545189418875, + "language_loss": 1.20746219, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.23145521, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 2.7250313758850098 + }, + { + "auxiliary_loss_clip": 0.01205526, + "auxiliary_loss_mlp": 0.01193619, + "balance_loss_clip": 1.01232195, + "balance_loss_mlp": 1.00336099, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.268927370635615, + "language_loss": 1.12921047, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.15320206, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 2.726951837539673 + }, + { + "auxiliary_loss_clip": 0.01205324, + "auxiliary_loss_mlp": 0.0119323, + "balance_loss_clip": 1.01211476, + "balance_loss_mlp": 1.00335383, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.452211588831474, + "language_loss": 1.11251044, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.13649607, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 5.590094089508057 + }, + { + "auxiliary_loss_clip": 0.01205362, + "auxiliary_loss_mlp": 0.01193439, + "balance_loss_clip": 1.0121851, + "balance_loss_mlp": 1.00356245, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.48160749573755, + "language_loss": 1.12590718, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.14989519, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 2.687704563140869 + }, + { + "auxiliary_loss_clip": 0.01205282, + "auxiliary_loss_mlp": 0.01192917, + "balance_loss_clip": 1.01215947, + "balance_loss_mlp": 1.00313568, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.9275111527008253, + "language_loss": 1.08059919, + "learning_rate": 1.860972167459798e-06, + "loss": 1.10458112, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 2.7366650104522705 + }, + { + "auxiliary_loss_clip": 0.01205443, + "auxiliary_loss_mlp": 0.01192948, + "balance_loss_clip": 1.01223922, + "balance_loss_mlp": 1.00288081, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 5.578395426623724, + "language_loss": 1.02501798, + "learning_rate": 1.89578346593066e-06, + "loss": 1.04900193, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.677525281906128 + }, + { + "auxiliary_loss_clip": 0.01205287, + "auxiliary_loss_mlp": 0.01192349, + "balance_loss_clip": 1.0122174, + "balance_loss_mlp": 1.00275826, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 5.930399411907217, + "language_loss": 1.16639566, + "learning_rate": 1.928808765521199e-06, + "loss": 1.19037199, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.665472984313965 + }, + { + "auxiliary_loss_clip": 0.01205107, + "auxiliary_loss_mlp": 0.01192977, + "balance_loss_clip": 1.01189637, + "balance_loss_mlp": 1.00290942, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 4.179526986884907, + "language_loss": 1.05940568, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.08338654, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.6738991737365723 + }, + { + "auxiliary_loss_clip": 0.01204832, + "auxiliary_loss_mlp": 0.01193513, + "balance_loss_clip": 1.01163661, + "balance_loss_mlp": 1.0034461, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.7245843379838655, + "language_loss": 1.05880535, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.08278883, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.694796085357666 + }, + { + "auxiliary_loss_clip": 0.01204935, + "auxiliary_loss_mlp": 0.0119295, + "balance_loss_clip": 1.01171923, + "balance_loss_mlp": 1.00316858, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8292046129800474, + "language_loss": 0.91930234, + "learning_rate": 2.018794797290208e-06, + "loss": 0.94328117, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.6908226013183594 + }, + { + "auxiliary_loss_clip": 0.01205089, + "auxiliary_loss_mlp": 0.01193552, + "balance_loss_clip": 1.01184809, + "balance_loss_mlp": 1.0036757, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.635823185883203, + "language_loss": 1.08232617, + "learning_rate": 2.046196897962839e-06, + "loss": 1.10631251, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 2.6551077365875244 + }, + { + "auxiliary_loss_clip": 0.01204767, + "auxiliary_loss_mlp": 0.01192842, + "balance_loss_clip": 1.01165104, + "balance_loss_mlp": 1.00287008, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 18.049339768751132, + "language_loss": 1.01338077, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.03735685, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 2.634430170059204 + }, + { + "auxiliary_loss_clip": 0.01204917, + "auxiliary_loss_mlp": 0.011925, + "balance_loss_clip": 1.0117836, + "balance_loss_mlp": 1.0026238, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.9858753892233367, + "language_loss": 1.06573963, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.08971369, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 2.6732280254364014 + }, + { + "auxiliary_loss_clip": 0.01204875, + "auxiliary_loss_mlp": 0.01192719, + "balance_loss_clip": 1.01176536, + "balance_loss_mlp": 1.00274742, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 6.191405973465618, + "language_loss": 0.95592296, + "learning_rate": 2.122031762649933e-06, + "loss": 0.97989893, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.7212045192718506 + }, + { + "auxiliary_loss_clip": 0.01205028, + "auxiliary_loss_mlp": 0.01192545, + "balance_loss_clip": 1.01195669, + "balance_loss_mlp": 1.0029546, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7070833274909267, + "language_loss": 1.06396365, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.08793926, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.644355535507202 + }, + { + "auxiliary_loss_clip": 0.01204958, + "auxiliary_loss_mlp": 0.01192955, + "balance_loss_clip": 1.01174927, + "balance_loss_mlp": 1.00317407, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.3633436449124083, + "language_loss": 1.02819538, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.05217457, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.6873764991760254 + }, + { + "auxiliary_loss_clip": 0.01204957, + "auxiliary_loss_mlp": 0.01193131, + "balance_loss_clip": 1.01170015, + "balance_loss_mlp": 1.0030638, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.3130767692567717, + "language_loss": 1.19484079, + "learning_rate": 2.189868360711334e-06, + "loss": 1.21882164, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.654974937438965 + }, + { + "auxiliary_loss_clip": 0.012048, + "auxiliary_loss_mlp": 0.01193481, + "balance_loss_clip": 1.0116241, + "balance_loss_mlp": 1.00360417, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 2.779951230975832, + "language_loss": 1.02510059, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.04908347, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.738346815109253 + }, + { + "auxiliary_loss_clip": 0.01204694, + "auxiliary_loss_mlp": 0.01193237, + "balance_loss_clip": 1.01160514, + "balance_loss_mlp": 1.00326478, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.2783701466611115, + "language_loss": 0.95492786, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.97890711, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.7171266078948975 + }, + { + "auxiliary_loss_clip": 0.01204896, + "auxiliary_loss_mlp": 0.01193167, + "balance_loss_clip": 1.01173234, + "balance_loss_mlp": 1.00319529, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.156447476050876, + "language_loss": 0.9543398, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.97832048, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.6411187648773193 + }, + { + "auxiliary_loss_clip": 0.01204722, + "auxiliary_loss_mlp": 0.0119311, + "balance_loss_clip": 1.01165426, + "balance_loss_mlp": 1.00342369, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 2.135857935110165, + "language_loss": 0.91593605, + "learning_rate": 2.270454923596497e-06, + "loss": 0.93991435, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.701514959335327 + }, + { + "auxiliary_loss_clip": 0.01203964, + "auxiliary_loss_mlp": 0.0119283, + "balance_loss_clip": 1.01079202, + "balance_loss_mlp": 1.00295329, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.444178275090244, + "language_loss": 0.76729846, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.79126638, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 2.901289701461792 + }, + { + "auxiliary_loss_clip": 0.01204113, + "auxiliary_loss_mlp": 0.01192266, + "balance_loss_clip": 1.01099181, + "balance_loss_mlp": 1.00248504, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.4483718870247886, + "language_loss": 0.88792229, + "learning_rate": 2.307256493152974e-06, + "loss": 0.9118861, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.6483566761016846 + }, + { + "auxiliary_loss_clip": 0.01204157, + "auxiliary_loss_mlp": 0.01192672, + "balance_loss_clip": 1.01099038, + "balance_loss_mlp": 1.00289083, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 2.6074569256529405, + "language_loss": 0.93115824, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.95512652, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.6835899353027344 + }, + { + "auxiliary_loss_clip": 0.01204222, + "auxiliary_loss_mlp": 0.01193249, + "balance_loss_clip": 1.01105762, + "balance_loss_mlp": 1.00365853, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.463281937458501, + "language_loss": 1.04055262, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.06452751, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.7014873027801514 + }, + { + "auxiliary_loss_clip": 0.01204096, + "auxiliary_loss_mlp": 0.01192368, + "balance_loss_clip": 1.01093769, + "balance_loss_mlp": 1.00258732, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.4871980973872527, + "language_loss": 0.85361582, + "learning_rate": 2.358792165262154e-06, + "loss": 0.87758052, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.6969797611236572 + }, + { + "auxiliary_loss_clip": 0.01203999, + "auxiliary_loss_mlp": 0.0119293, + "balance_loss_clip": 1.01075625, + "balance_loss_mlp": 1.00295806, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.657599865500169, + "language_loss": 0.90388024, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.92784947, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.6830012798309326 + }, + { + "auxiliary_loss_clip": 0.01203919, + "auxiliary_loss_mlp": 0.01192884, + "balance_loss_clip": 1.01076436, + "balance_loss_mlp": 1.00310302, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 4.520474301640525, + "language_loss": 0.93305707, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.95702505, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.6321918964385986 + }, + { + "auxiliary_loss_clip": 0.01203732, + "auxiliary_loss_mlp": 0.01192078, + "balance_loss_clip": 1.0105648, + "balance_loss_mlp": 1.0024879, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 2.626840877297585, + "language_loss": 0.97431576, + "learning_rate": 2.4065067449483835e-06, + "loss": 0.99827391, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.642505168914795 + }, + { + "auxiliary_loss_clip": 0.01203775, + "auxiliary_loss_mlp": 0.01192336, + "balance_loss_clip": 1.01068282, + "balance_loss_mlp": 1.00265002, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.5425592844384237, + "language_loss": 0.97617602, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00013709, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.6871957778930664 + }, + { + "auxiliary_loss_clip": 0.01204159, + "auxiliary_loss_mlp": 0.01192606, + "balance_loss_clip": 1.01092005, + "balance_loss_mlp": 1.00253868, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.698093645508543, + "language_loss": 0.93478096, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.95874864, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.633312225341797 + }, + { + "auxiliary_loss_clip": 0.01204005, + "auxiliary_loss_mlp": 0.01192537, + "balance_loss_clip": 1.01083279, + "balance_loss_mlp": 1.00256491, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.4016336820465343, + "language_loss": 0.98746669, + "learning_rate": 2.450927955901469e-06, + "loss": 1.01143217, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.676567316055298 + }, + { + "auxiliary_loss_clip": 0.01203941, + "auxiliary_loss_mlp": 0.01191938, + "balance_loss_clip": 1.01083422, + "balance_loss_mlp": 1.00206113, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 2.175514178366731, + "language_loss": 1.02591896, + "learning_rate": 2.465079122983384e-06, + "loss": 1.04987776, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.718069553375244 + }, + { + "auxiliary_loss_clip": 0.01203712, + "auxiliary_loss_mlp": 0.01191854, + "balance_loss_clip": 1.01051188, + "balance_loss_mlp": 1.00216818, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.7854599480678677, + "language_loss": 0.88101447, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.90497005, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 2.8197357654571533 + }, + { + "auxiliary_loss_clip": 0.01203452, + "auxiliary_loss_mlp": 0.01191952, + "balance_loss_clip": 1.01037359, + "balance_loss_mlp": 1.00236154, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 1.79691231508388, + "language_loss": 0.87629187, + "learning_rate": 2.492481223656015e-06, + "loss": 0.9002459, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.7397544384002686 + }, + { + "auxiliary_loss_clip": 0.01203287, + "auxiliary_loss_mlp": 0.01191877, + "balance_loss_clip": 1.0102036, + "balance_loss_mlp": 1.0020963, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.3209080492164973, + "language_loss": 0.89651233, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92046398, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.7005465030670166 + }, + { + "auxiliary_loss_clip": 0.0120343, + "auxiliary_loss_mlp": 0.01192758, + "balance_loss_clip": 1.01026177, + "balance_loss_mlp": 1.00278568, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.060228429960853, + "language_loss": 0.9089548, + "learning_rate": 2.51876455396287e-06, + "loss": 0.9329167, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.7077152729034424 + }, + { + "auxiliary_loss_clip": 0.01203368, + "auxiliary_loss_mlp": 0.01192188, + "balance_loss_clip": 1.0101763, + "balance_loss_mlp": 1.00221574, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 2.9847389654810343, + "language_loss": 0.86908394, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.89303946, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.7365691661834717 + }, + { + "auxiliary_loss_clip": 0.01203096, + "auxiliary_loss_mlp": 0.0119259, + "balance_loss_clip": 1.01000011, + "balance_loss_mlp": 1.00280929, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 1.9903723704322411, + "language_loss": 0.95080179, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.97475874, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 2.8968496322631836 + }, + { + "auxiliary_loss_clip": 0.01203345, + "auxiliary_loss_mlp": 0.01192553, + "balance_loss_clip": 1.01022291, + "balance_loss_mlp": 1.00277185, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 10.25822232091138, + "language_loss": 0.92182004, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.94577903, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.670517921447754 + }, + { + "auxiliary_loss_clip": 0.0120328, + "auxiliary_loss_mlp": 0.01192615, + "balance_loss_clip": 1.01017284, + "balance_loss_mlp": 1.00292969, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.4753050995090855, + "language_loss": 0.82903504, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85299402, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 5.584779262542725 + }, + { + "auxiliary_loss_clip": 0.01203079, + "auxiliary_loss_mlp": 0.01192588, + "balance_loss_clip": 1.00997913, + "balance_loss_mlp": 1.00271189, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.532057006731712, + "language_loss": 0.81218266, + "learning_rate": 2.580130221340046e-06, + "loss": 0.83613932, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 2.7615065574645996 + }, + { + "auxiliary_loss_clip": 0.01202935, + "auxiliary_loss_mlp": 0.01192201, + "balance_loss_clip": 1.00977731, + "balance_loss_mlp": 1.0023241, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.9560539751396244, + "language_loss": 0.86798179, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.89193308, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.711049795150757 + }, + { + "auxiliary_loss_clip": 0.01202973, + "auxiliary_loss_mlp": 0.01192695, + "balance_loss_clip": 1.00973928, + "balance_loss_mlp": 1.00281811, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 1.9054473867219264, + "language_loss": 0.92596829, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.94992501, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 2.7273788452148438 + }, + { + "auxiliary_loss_clip": 0.01202762, + "auxiliary_loss_mlp": 0.011919, + "balance_loss_clip": 1.00971341, + "balance_loss_mlp": 1.00240517, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.208159476253555, + "language_loss": 0.99618489, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02013147, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.670692205429077 + }, + { + "auxiliary_loss_clip": 0.01202814, + "auxiliary_loss_mlp": 0.01192321, + "balance_loss_clip": 1.00980663, + "balance_loss_mlp": 1.00244451, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.4296947400539612, + "language_loss": 0.8808111, + "learning_rate": 2.625331386578098e-06, + "loss": 0.90476245, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.63246488571167 + }, + { + "auxiliary_loss_clip": 0.0120302, + "auxiliary_loss_mlp": 0.01191978, + "balance_loss_clip": 1.00991547, + "balance_loss_mlp": 1.00229251, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.0073566251747175, + "language_loss": 0.93394881, + "learning_rate": 2.63615268640451e-06, + "loss": 0.9578988, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.6663482189178467 + }, + { + "auxiliary_loss_clip": 0.01202803, + "auxiliary_loss_mlp": 0.01192184, + "balance_loss_clip": 1.00965405, + "balance_loss_mlp": 1.00249767, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 2.8469750275925327, + "language_loss": 0.89839077, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.92234063, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.6847586631774902 + }, + { + "auxiliary_loss_clip": 0.012027, + "auxiliary_loss_mlp": 0.01192293, + "balance_loss_clip": 1.00956488, + "balance_loss_mlp": 1.00251186, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 1.8282338736151522, + "language_loss": 0.88295585, + "learning_rate": 2.657264485425803e-06, + "loss": 0.90690577, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.6891725063323975 + }, + { + "auxiliary_loss_clip": 0.01202534, + "auxiliary_loss_mlp": 0.0119214, + "balance_loss_clip": 1.00940657, + "balance_loss_mlp": 1.00254965, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.0398116978825636, + "language_loss": 0.96179187, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.98573864, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.623422384262085 + }, + { + "auxiliary_loss_clip": 0.01202811, + "auxiliary_loss_mlp": 0.01192165, + "balance_loss_clip": 1.0096817, + "balance_loss_mlp": 1.00267005, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.8573161669472538, + "language_loss": 0.98811698, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01206672, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.632704257965088 + }, + { + "auxiliary_loss_clip": 0.01202874, + "auxiliary_loss_mlp": 0.01191412, + "balance_loss_clip": 1.00972664, + "balance_loss_mlp": 1.00182116, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.404432003619453, + "language_loss": 0.85493696, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.87887979, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.622945547103882 + }, + { + "auxiliary_loss_clip": 0.01202579, + "auxiliary_loss_mlp": 0.01192111, + "balance_loss_clip": 1.00940847, + "balance_loss_mlp": 1.00242472, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 1.961358843134981, + "language_loss": 0.85287517, + "learning_rate": 2.697518353781685e-06, + "loss": 0.87682205, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.6376793384552 + }, + { + "auxiliary_loss_clip": 0.01202662, + "auxiliary_loss_mlp": 0.0119231, + "balance_loss_clip": 1.00946558, + "balance_loss_mlp": 1.00243366, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.2278444198643963, + "language_loss": 0.96208811, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.98603779, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.6583077907562256 + }, + { + "auxiliary_loss_clip": 0.01202293, + "auxiliary_loss_mlp": 0.01191679, + "balance_loss_clip": 1.00928116, + "balance_loss_mlp": 1.00227892, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 3.2280797169225464, + "language_loss": 0.94295657, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.96689636, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.642460584640503 + }, + { + "auxiliary_loss_clip": 0.01202434, + "auxiliary_loss_mlp": 0.01192058, + "balance_loss_clip": 1.00932384, + "balance_loss_mlp": 1.00256288, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.711389887455094, + "language_loss": 0.95715809, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98110294, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.623488187789917 + }, + { + "auxiliary_loss_clip": 0.01202342, + "auxiliary_loss_mlp": 0.01191909, + "balance_loss_clip": 1.00923657, + "balance_loss_mlp": 1.00241375, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.724628428747175, + "language_loss": 0.97938228, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00332475, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.6572577953338623 + }, + { + "auxiliary_loss_clip": 0.01202236, + "auxiliary_loss_mlp": 0.01191907, + "balance_loss_clip": 1.00918436, + "balance_loss_mlp": 1.00231647, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.83051321124314, + "language_loss": 0.93993711, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96387851, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.619446039199829 + }, + { + "auxiliary_loss_clip": 0.01207787, + "auxiliary_loss_mlp": 0.01201363, + "balance_loss_clip": 1.01467907, + "balance_loss_mlp": 1.01215386, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.420395363424701, + "language_loss": 0.65771711, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68180859, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.167431354522705 + }, + { + "auxiliary_loss_clip": 0.01207816, + "auxiliary_loss_mlp": 0.01201105, + "balance_loss_clip": 1.01469386, + "balance_loss_mlp": 1.01189566, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.243004859779477, + "language_loss": 0.63750184, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66159105, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.203794240951538 + }, + { + "auxiliary_loss_clip": 0.01202187, + "auxiliary_loss_mlp": 0.01192072, + "balance_loss_clip": 1.00911427, + "balance_loss_mlp": 1.00248122, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.138648459610634, + "language_loss": 0.85761565, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88155824, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.658661365509033 + }, + { + "auxiliary_loss_clip": 0.01202157, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_clip": 1.00899374, + "balance_loss_mlp": 1.00227714, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.093492029164208, + "language_loss": 0.969679, + "learning_rate": 2.779824149153005e-06, + "loss": 0.99361742, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.6462111473083496 + }, + { + "auxiliary_loss_clip": 0.01201828, + "auxiliary_loss_mlp": 0.01191567, + "balance_loss_clip": 1.00874972, + "balance_loss_mlp": 1.00207186, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.0862933515502062, + "language_loss": 0.87563765, + "learning_rate": 2.788352117317012e-06, + "loss": 0.8995716, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.619516611099243 + }, + { + "auxiliary_loss_clip": 0.01202029, + "auxiliary_loss_mlp": 0.01191855, + "balance_loss_clip": 1.00892246, + "balance_loss_mlp": 1.00255036, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 2.1641548111092996, + "language_loss": 0.91803139, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94197029, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.6884450912475586 + }, + { + "auxiliary_loss_clip": 0.01201881, + "auxiliary_loss_mlp": 0.01191846, + "balance_loss_clip": 1.00884473, + "balance_loss_mlp": 1.00254166, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.3835216984525407, + "language_loss": 0.91999191, + "learning_rate": 2.80507649095533e-06, + "loss": 0.9439292, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.645042896270752 + }, + { + "auxiliary_loss_clip": 0.01201723, + "auxiliary_loss_mlp": 0.01191704, + "balance_loss_clip": 1.00872362, + "balance_loss_mlp": 1.00230455, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.1345900730412413, + "language_loss": 0.82482278, + "learning_rate": 2.813278540517843e-06, + "loss": 0.84875703, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.6611053943634033 + }, + { + "auxiliary_loss_clip": 0.01201783, + "auxiliary_loss_mlp": 0.01191178, + "balance_loss_clip": 1.00861263, + "balance_loss_mlp": 1.00187421, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.3579816604062334, + "language_loss": 0.91198426, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.93591386, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.6654815673828125 + }, + { + "auxiliary_loss_clip": 0.01201749, + "auxiliary_loss_mlp": 0.01191872, + "balance_loss_clip": 1.00860751, + "balance_loss_mlp": 1.0022819, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.0790154711693947, + "language_loss": 0.94844282, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97237909, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.653749942779541 + }, + { + "auxiliary_loss_clip": 0.01201644, + "auxiliary_loss_mlp": 0.01191897, + "balance_loss_clip": 1.00855446, + "balance_loss_mlp": 1.00230694, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.6311585250818723, + "language_loss": 0.95863962, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.98257494, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.6428635120391846 + }, + { + "auxiliary_loss_clip": 0.01201476, + "auxiliary_loss_mlp": 0.01191631, + "balance_loss_clip": 1.00838518, + "balance_loss_mlp": 1.00213563, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.0780314457122904, + "language_loss": 0.86459392, + "learning_rate": 2.84508017388607e-06, + "loss": 0.88852501, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.682795763015747 + }, + { + "auxiliary_loss_clip": 0.01201409, + "auxiliary_loss_mlp": 0.01191909, + "balance_loss_clip": 1.00833631, + "balance_loss_mlp": 1.00241351, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.386023740576118, + "language_loss": 0.91828418, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94221735, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.6174230575561523 + }, + { + "auxiliary_loss_clip": 0.01206955, + "auxiliary_loss_mlp": 0.01200002, + "balance_loss_clip": 1.01384616, + "balance_loss_mlp": 1.01079261, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.3924345302701189, + "language_loss": 0.6251325, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.64920205, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.239703893661499 + }, + { + "auxiliary_loss_clip": 0.01201382, + "auxiliary_loss_mlp": 0.01191625, + "balance_loss_clip": 1.00827003, + "balance_loss_mlp": 1.0022254, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.8041509547034689, + "language_loss": 0.9073385, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93126857, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.7258927822113037 + }, + { + "auxiliary_loss_clip": 0.01201477, + "auxiliary_loss_mlp": 0.01191746, + "balance_loss_clip": 1.0083313, + "balance_loss_mlp": 1.00215542, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 4.261062490925349, + "language_loss": 0.82081127, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84474349, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.6513164043426514 + }, + { + "auxiliary_loss_clip": 0.0120141, + "auxiliary_loss_mlp": 0.01191485, + "balance_loss_clip": 1.00832617, + "balance_loss_mlp": 1.00208473, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.9473168451133904, + "language_loss": 0.95630765, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98023665, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.645108699798584 + }, + { + "auxiliary_loss_clip": 0.0120146, + "auxiliary_loss_mlp": 0.01191413, + "balance_loss_clip": 1.00834644, + "balance_loss_mlp": 1.00191796, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 2.2244745827791155, + "language_loss": 0.85814995, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88207865, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.6599338054656982 + }, + { + "auxiliary_loss_clip": 0.01201511, + "auxiliary_loss_mlp": 0.01191488, + "balance_loss_clip": 1.00837255, + "balance_loss_mlp": 1.00218415, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 2.179517846136811, + "language_loss": 0.91479445, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.9387244, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.6954662799835205 + }, + { + "auxiliary_loss_clip": 0.01201203, + "auxiliary_loss_mlp": 0.01191108, + "balance_loss_clip": 1.00812292, + "balance_loss_mlp": 1.00180328, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.3584496297441313, + "language_loss": 0.85773373, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88165683, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.6451663970947266 + }, + { + "auxiliary_loss_clip": 0.01201208, + "auxiliary_loss_mlp": 0.01191117, + "balance_loss_clip": 1.00810325, + "balance_loss_mlp": 1.00181258, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 3.1859102005927014, + "language_loss": 0.8665055, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89042878, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 4.163480043411255 + }, + { + "auxiliary_loss_clip": 0.01201227, + "auxiliary_loss_mlp": 0.0119187, + "balance_loss_clip": 1.00802648, + "balance_loss_mlp": 1.00208855, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.8740467342921168, + "language_loss": 0.91918141, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94311231, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 5.496267795562744 + }, + { + "auxiliary_loss_clip": 0.01201335, + "auxiliary_loss_mlp": 0.0119183, + "balance_loss_clip": 1.0081563, + "balance_loss_mlp": 1.00214362, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.0508437659177075, + "language_loss": 0.87272608, + "learning_rate": 2.925210265866963e-06, + "loss": 0.89665771, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.6326754093170166 + }, + { + "auxiliary_loss_clip": 0.01206523, + "auxiliary_loss_mlp": 0.01199252, + "balance_loss_clip": 1.01342893, + "balance_loss_mlp": 1.01004255, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3954404677546848, + "language_loss": 0.68143332, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70549107, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 3.040726900100708 + }, + { + "auxiliary_loss_clip": 0.0120108, + "auxiliary_loss_mlp": 0.01191612, + "balance_loss_clip": 1.00798237, + "balance_loss_mlp": 1.00240302, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 4.033518723807898, + "language_loss": 0.89963746, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92356437, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 2.6052966117858887 + }, + { + "auxiliary_loss_clip": 0.01201033, + "auxiliary_loss_mlp": 0.01191582, + "balance_loss_clip": 1.00786495, + "balance_loss_mlp": 1.00218248, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 3.01089895596475, + "language_loss": 0.89494026, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.91886646, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 2.6524975299835205 + }, + { + "auxiliary_loss_clip": 0.01200972, + "auxiliary_loss_mlp": 0.0119159, + "balance_loss_clip": 1.00779629, + "balance_loss_mlp": 1.00209475, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.084044965384674, + "language_loss": 0.7647348, + "learning_rate": 2.952041322436969e-06, + "loss": 0.78866041, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.6430673599243164 + }, + { + "auxiliary_loss_clip": 0.01206221, + "auxiliary_loss_mlp": 0.01198485, + "balance_loss_clip": 1.01312661, + "balance_loss_mlp": 1.00927591, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.036182267906669, + "language_loss": 0.6552639, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.67931092, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.2200140953063965 + }, + { + "auxiliary_loss_clip": 0.01200994, + "auxiliary_loss_mlp": 0.01191339, + "balance_loss_clip": 1.00794208, + "balance_loss_mlp": 1.00213015, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.0589544645280657, + "language_loss": 0.90720791, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93113124, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.6383185386657715 + }, + { + "auxiliary_loss_clip": 0.01201233, + "auxiliary_loss_mlp": 0.01191452, + "balance_loss_clip": 1.00803685, + "balance_loss_mlp": 1.00195742, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.096955608955785, + "language_loss": 0.91083223, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93475902, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.644968271255493 + }, + { + "auxiliary_loss_clip": 0.01201022, + "auxiliary_loss_mlp": 0.01191277, + "balance_loss_clip": 1.0079633, + "balance_loss_mlp": 1.00206792, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.042537035646387, + "language_loss": 0.90628099, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93020391, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.6627109050750732 + }, + { + "auxiliary_loss_clip": 0.01201054, + "auxiliary_loss_mlp": 0.01191166, + "balance_loss_clip": 1.00798988, + "balance_loss_mlp": 1.0017662, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.0385807029918985, + "language_loss": 0.87915373, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90307593, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.6434571743011475 + }, + { + "auxiliary_loss_clip": 0.01200894, + "auxiliary_loss_mlp": 0.01191159, + "balance_loss_clip": 1.00787234, + "balance_loss_mlp": 1.00204539, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 2.0072256781796893, + "language_loss": 0.93602401, + "learning_rate": 2.990301221458371e-06, + "loss": 0.95994449, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.608814001083374 + }, + { + "auxiliary_loss_clip": 0.01201001, + "auxiliary_loss_mlp": 0.01191169, + "balance_loss_clip": 1.00797892, + "balance_loss_mlp": 1.00215125, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.7521568900487865, + "language_loss": 0.96493173, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.98885345, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.6177661418914795 + }, + { + "auxiliary_loss_clip": 0.01201011, + "auxiliary_loss_mlp": 0.011914, + "balance_loss_clip": 1.0079205, + "balance_loss_mlp": 1.00190496, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.6783753299615634, + "language_loss": 0.87037599, + "learning_rate": 3.002565443382063e-06, + "loss": 0.8943001, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.6633269786834717 + }, + { + "auxiliary_loss_clip": 0.01200559, + "auxiliary_loss_mlp": 0.01191405, + "balance_loss_clip": 1.00752091, + "balance_loss_mlp": 1.00200522, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.806186124601827, + "language_loss": 0.83568347, + "learning_rate": 3.008611048208843e-06, + "loss": 0.85960317, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.611298084259033 + }, + { + "auxiliary_loss_clip": 0.012056, + "auxiliary_loss_mlp": 0.01196617, + "balance_loss_clip": 1.0125761, + "balance_loss_mlp": 1.00740802, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 1.0215574542018808, + "language_loss": 0.64793736, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67195952, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.202425956726074 + }, + { + "auxiliary_loss_clip": 0.01200526, + "auxiliary_loss_mlp": 0.01191008, + "balance_loss_clip": 1.00753593, + "balance_loss_mlp": 1.00179958, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 2.1334272417262232, + "language_loss": 0.97614169, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00005698, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.694645404815674 + }, + { + "auxiliary_loss_clip": 0.01200563, + "auxiliary_loss_mlp": 0.01191097, + "balance_loss_clip": 1.00764894, + "balance_loss_mlp": 1.0016973, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.6759626944562707, + "language_loss": 0.84073198, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86464858, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.6767418384552 + }, + { + "auxiliary_loss_clip": 0.01200471, + "auxiliary_loss_mlp": 0.01191417, + "balance_loss_clip": 1.00733447, + "balance_loss_mlp": 1.00230348, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.1716746320294753, + "language_loss": 0.82821876, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85213768, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.678135633468628 + }, + { + "auxiliary_loss_clip": 0.01200368, + "auxiliary_loss_mlp": 0.0119095, + "balance_loss_clip": 1.00735486, + "balance_loss_mlp": 1.00164521, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 2.1373743266171057, + "language_loss": 0.93797392, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96188712, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.6513071060180664 + }, + { + "auxiliary_loss_clip": 0.01200542, + "auxiliary_loss_mlp": 0.01190648, + "balance_loss_clip": 1.00752866, + "balance_loss_mlp": 1.00172544, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.553287278446391, + "language_loss": 0.79425991, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81817174, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.592958927154541 + }, + { + "auxiliary_loss_clip": 0.0120044, + "auxiliary_loss_mlp": 0.01190748, + "balance_loss_clip": 1.00744414, + "balance_loss_mlp": 1.00172937, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 1.9127315565434033, + "language_loss": 0.93413746, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95804936, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.6162118911743164 + }, + { + "auxiliary_loss_clip": 0.01200443, + "auxiliary_loss_mlp": 0.01191245, + "balance_loss_clip": 1.00735927, + "balance_loss_mlp": 1.00194049, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.11711675579608, + "language_loss": 0.94627267, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97018957, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.637697219848633 + }, + { + "auxiliary_loss_clip": 0.01200316, + "auxiliary_loss_mlp": 0.01190527, + "balance_loss_clip": 1.00723803, + "balance_loss_mlp": 1.00141358, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 2.682293792851984, + "language_loss": 0.81768036, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84158874, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.6112215518951416 + }, + { + "auxiliary_loss_clip": 0.01200029, + "auxiliary_loss_mlp": 0.01190838, + "balance_loss_clip": 1.00704241, + "balance_loss_mlp": 1.00172436, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.5883607282231083, + "language_loss": 0.87956309, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90347171, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.664496421813965 + }, + { + "auxiliary_loss_clip": 0.01200116, + "auxiliary_loss_mlp": 0.01190936, + "balance_loss_clip": 1.00716352, + "balance_loss_mlp": 1.00172687, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 3.0173799714237304, + "language_loss": 0.84534299, + "learning_rate": 3.071615712271274e-06, + "loss": 0.86925352, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.6027815341949463 + }, + { + "auxiliary_loss_clip": 0.01200275, + "auxiliary_loss_mlp": 0.01191601, + "balance_loss_clip": 1.00723863, + "balance_loss_mlp": 1.00248694, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.552429497376284, + "language_loss": 0.99100894, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.01492774, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.603881359100342 + }, + { + "auxiliary_loss_clip": 0.01200273, + "auxiliary_loss_mlp": 0.01190956, + "balance_loss_clip": 1.00711322, + "balance_loss_mlp": 1.00136518, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.6727925147499887, + "language_loss": 0.88973445, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91364676, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.6098289489746094 + }, + { + "auxiliary_loss_clip": 0.01200097, + "auxiliary_loss_mlp": 0.01190656, + "balance_loss_clip": 1.00707674, + "balance_loss_mlp": 1.0014472, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.8148577615076567, + "language_loss": 0.93443543, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.95834291, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.64154052734375 + }, + { + "auxiliary_loss_clip": 0.01200149, + "auxiliary_loss_mlp": 0.01191276, + "balance_loss_clip": 1.00713205, + "balance_loss_mlp": 1.00206685, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.4262661337658744, + "language_loss": 0.90208495, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92599916, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.5782268047332764 + }, + { + "auxiliary_loss_clip": 0.01199932, + "auxiliary_loss_mlp": 0.01190861, + "balance_loss_clip": 1.0069418, + "balance_loss_mlp": 1.0018425, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.684983673527806, + "language_loss": 0.92313319, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.94704109, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.6325604915618896 + }, + { + "auxiliary_loss_clip": 0.01199844, + "auxiliary_loss_mlp": 0.01190856, + "balance_loss_clip": 1.00676358, + "balance_loss_mlp": 1.001647, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.173596595646119, + "language_loss": 0.71026856, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73417556, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.7019622325897217 + }, + { + "auxiliary_loss_clip": 0.01199763, + "auxiliary_loss_mlp": 0.01190413, + "balance_loss_clip": 1.00682557, + "balance_loss_mlp": 1.00139475, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 6.593297031119356, + "language_loss": 0.88230956, + "learning_rate": 3.108720342404542e-06, + "loss": 0.90621126, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.676427125930786 + }, + { + "auxiliary_loss_clip": 0.01199855, + "auxiliary_loss_mlp": 0.01191099, + "balance_loss_clip": 1.00688791, + "balance_loss_mlp": 1.00188971, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 3.29702651752245, + "language_loss": 0.82168567, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84559524, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.6365280151367188 + }, + { + "auxiliary_loss_clip": 0.01199794, + "auxiliary_loss_mlp": 0.0119104, + "balance_loss_clip": 1.0067302, + "balance_loss_mlp": 1.0019263, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 2.7358669370243804, + "language_loss": 0.67045784, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69436616, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.616305112838745 + }, + { + "auxiliary_loss_clip": 0.01199849, + "auxiliary_loss_mlp": 0.01190677, + "balance_loss_clip": 1.00689387, + "balance_loss_mlp": 1.00156319, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 2.0014288961444335, + "language_loss": 0.88246399, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90636927, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.659189224243164 + }, + { + "auxiliary_loss_clip": 0.01199737, + "auxiliary_loss_mlp": 0.01191157, + "balance_loss_clip": 1.00673962, + "balance_loss_mlp": 1.0019474, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.6834776652537329, + "language_loss": 0.84641248, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87032139, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.6301629543304443 + }, + { + "auxiliary_loss_clip": 0.01199724, + "auxiliary_loss_mlp": 0.01190762, + "balance_loss_clip": 1.00682235, + "balance_loss_mlp": 1.0015533, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.0065743728246175, + "language_loss": 0.97400331, + "learning_rate": 3.133972684206866e-06, + "loss": 0.99790812, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 4.095647811889648 + }, + { + "auxiliary_loss_clip": 0.01199636, + "auxiliary_loss_mlp": 0.01190698, + "balance_loss_clip": 1.00662637, + "balance_loss_mlp": 1.0016799, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1918440321694503, + "language_loss": 0.82544947, + "learning_rate": 3.138906441556014e-06, + "loss": 0.84935284, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 5.5164453983306885 + }, + { + "auxiliary_loss_clip": 0.01199794, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_clip": 1.00674891, + "balance_loss_mlp": 1.00171518, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 4.111373960910686, + "language_loss": 0.82840133, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85230756, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.6667919158935547 + }, + { + "auxiliary_loss_clip": 0.01199484, + "auxiliary_loss_mlp": 0.01190677, + "balance_loss_clip": 1.00652003, + "balance_loss_mlp": 1.00165856, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.326083058136395, + "language_loss": 0.95413256, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.97803414, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.5819263458251953 + }, + { + "auxiliary_loss_clip": 0.01199534, + "auxiliary_loss_mlp": 0.01190454, + "balance_loss_clip": 1.00660872, + "balance_loss_mlp": 1.00162649, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.866442635379609, + "language_loss": 0.73171729, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7556172, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 2.692584991455078 + }, + { + "auxiliary_loss_clip": 0.01199399, + "auxiliary_loss_mlp": 0.01190718, + "balance_loss_clip": 1.00640512, + "balance_loss_mlp": 1.00169945, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.010956210380501, + "language_loss": 0.88909245, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91299361, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.589648723602295 + }, + { + "auxiliary_loss_clip": 0.01199354, + "auxiliary_loss_mlp": 0.011906, + "balance_loss_clip": 1.00644517, + "balance_loss_mlp": 1.00167692, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.037249939810622, + "language_loss": 0.88981903, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91371852, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.6030147075653076 + }, + { + "auxiliary_loss_clip": 0.01199488, + "auxiliary_loss_mlp": 0.01190419, + "balance_loss_clip": 1.00644815, + "balance_loss_mlp": 1.00149572, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.209650133196753, + "language_loss": 0.84008718, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86398625, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 2.609757900238037 + }, + { + "auxiliary_loss_clip": 0.01199399, + "auxiliary_loss_mlp": 0.01190453, + "balance_loss_clip": 1.00643682, + "balance_loss_mlp": 1.00153005, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.8191202166019833, + "language_loss": 0.90057784, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92447639, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 2.658411979675293 + }, + { + "auxiliary_loss_clip": 0.0119919, + "auxiliary_loss_mlp": 0.01190469, + "balance_loss_clip": 1.00627065, + "balance_loss_mlp": 1.00145125, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.3637608604934184, + "language_loss": 0.91141045, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93530709, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.6562001705169678 + }, + { + "auxiliary_loss_clip": 0.01199385, + "auxiliary_loss_mlp": 0.01190649, + "balance_loss_clip": 1.00642514, + "balance_loss_mlp": 1.00153542, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.466379122451804, + "language_loss": 0.85755312, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88145357, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 2.666461706161499 + }, + { + "auxiliary_loss_clip": 0.01199391, + "auxiliary_loss_mlp": 0.01190794, + "balance_loss_clip": 1.00645018, + "balance_loss_mlp": 1.00168085, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.5040210507371694, + "language_loss": 0.84277856, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86668038, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.5790324211120605 + }, + { + "auxiliary_loss_clip": 0.01199259, + "auxiliary_loss_mlp": 0.01190832, + "balance_loss_clip": 1.00628471, + "balance_loss_mlp": 1.00162303, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 3.130818220739862, + "language_loss": 0.81293988, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.83684075, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.5788283348083496 + }, + { + "auxiliary_loss_clip": 0.01202933, + "auxiliary_loss_mlp": 0.0119405, + "balance_loss_clip": 1.01038957, + "balance_loss_mlp": 1.00560439, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0391100416352914, + "language_loss": 0.66871369, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69268346, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.276627540588379 + }, + { + "auxiliary_loss_clip": 0.01199182, + "auxiliary_loss_mlp": 0.0119053, + "balance_loss_clip": 1.00628352, + "balance_loss_mlp": 1.00160718, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2925982620619942, + "language_loss": 0.83963883, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86353594, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.6127641201019287 + }, + { + "auxiliary_loss_clip": 0.01199197, + "auxiliary_loss_mlp": 0.01190364, + "balance_loss_clip": 1.0062741, + "balance_loss_mlp": 1.00134587, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.0872293674161106, + "language_loss": 0.88549411, + "learning_rate": 3.204280886775619e-06, + "loss": 0.90938967, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.683446168899536 + }, + { + "auxiliary_loss_clip": 0.01199097, + "auxiliary_loss_mlp": 0.01190946, + "balance_loss_clip": 1.00605369, + "balance_loss_mlp": 1.00183272, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 4.33739478636632, + "language_loss": 0.86120844, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88510883, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.6739394664764404 + }, + { + "auxiliary_loss_clip": 0.01202549, + "auxiliary_loss_mlp": 0.01193758, + "balance_loss_clip": 1.01017559, + "balance_loss_mlp": 1.00531149, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.857072442341236, + "language_loss": 0.60138321, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62534618, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.197172164916992 + }, + { + "auxiliary_loss_clip": 0.01199221, + "auxiliary_loss_mlp": 0.01190399, + "balance_loss_clip": 1.00628722, + "balance_loss_mlp": 1.00157166, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 2.057834405714868, + "language_loss": 0.84633428, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87023044, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.6469905376434326 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01190627, + "balance_loss_clip": 1.00606906, + "balance_loss_mlp": 1.00199008, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.1968399161941408, + "language_loss": 0.88709635, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91099155, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.6381239891052246 + }, + { + "auxiliary_loss_clip": 0.01199086, + "auxiliary_loss_mlp": 0.01190309, + "balance_loss_clip": 1.00620222, + "balance_loss_mlp": 1.00129044, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.200210350818915, + "language_loss": 0.92858887, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95248282, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.7012534141540527 + }, + { + "auxiliary_loss_clip": 0.01198772, + "auxiliary_loss_mlp": 0.01190154, + "balance_loss_clip": 1.00583339, + "balance_loss_mlp": 1.00142241, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.8481658238071987, + "language_loss": 0.7426824, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76657164, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.784473419189453 + }, + { + "auxiliary_loss_clip": 0.01198985, + "auxiliary_loss_mlp": 0.01190291, + "balance_loss_clip": 1.0061307, + "balance_loss_mlp": 1.00155926, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.4072393513962775, + "language_loss": 0.88233161, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90622437, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.614840269088745 + }, + { + "auxiliary_loss_clip": 0.01198982, + "auxiliary_loss_mlp": 0.01189947, + "balance_loss_clip": 1.0061276, + "balance_loss_mlp": 1.00130999, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.8418205874096114, + "language_loss": 0.84025538, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86414468, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.6108968257904053 + }, + { + "auxiliary_loss_clip": 0.01199006, + "auxiliary_loss_mlp": 0.01190603, + "balance_loss_clip": 1.00612056, + "balance_loss_mlp": 1.0017755, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9119821885650778, + "language_loss": 0.89864933, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92254543, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.696056365966797 + }, + { + "auxiliary_loss_clip": 0.01198858, + "auxiliary_loss_mlp": 0.01190986, + "balance_loss_clip": 1.00603759, + "balance_loss_mlp": 1.00225425, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.4127544457407666, + "language_loss": 0.89779234, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92169082, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.7236199378967285 + }, + { + "auxiliary_loss_clip": 0.01198824, + "auxiliary_loss_mlp": 0.01190239, + "balance_loss_clip": 1.00596476, + "balance_loss_mlp": 1.00160241, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 4.688099990419057, + "language_loss": 0.86788601, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89177656, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.6091766357421875 + }, + { + "auxiliary_loss_clip": 0.01198928, + "auxiliary_loss_mlp": 0.0119029, + "balance_loss_clip": 1.00603557, + "balance_loss_mlp": 1.00136685, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 3.0010527034584347, + "language_loss": 0.99563146, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.01952362, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.5947012901306152 + }, + { + "auxiliary_loss_clip": 0.01198886, + "auxiliary_loss_mlp": 0.01190715, + "balance_loss_clip": 1.00611091, + "balance_loss_mlp": 1.0018872, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.261967829847258, + "language_loss": 0.88456476, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.90846074, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.650475025177002 + }, + { + "auxiliary_loss_clip": 0.01198681, + "auxiliary_loss_mlp": 0.0119046, + "balance_loss_clip": 1.00582731, + "balance_loss_mlp": 1.00182343, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.3934327288665496, + "language_loss": 0.86387414, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.88776565, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.5820889472961426 + }, + { + "auxiliary_loss_clip": 0.01198394, + "auxiliary_loss_mlp": 0.01190581, + "balance_loss_clip": 1.00567389, + "balance_loss_mlp": 1.00175309, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.7301590134749216, + "language_loss": 0.86647916, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89036888, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.6513631343841553 + }, + { + "auxiliary_loss_clip": 0.01198573, + "auxiliary_loss_mlp": 0.01190609, + "balance_loss_clip": 1.0057627, + "balance_loss_mlp": 1.00178123, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.5469336615780587, + "language_loss": 0.91520047, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.93909228, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.590458869934082 + }, + { + "auxiliary_loss_clip": 0.01198728, + "auxiliary_loss_mlp": 0.01190126, + "balance_loss_clip": 1.00585938, + "balance_loss_mlp": 1.0014894, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 4.56964334269056, + "language_loss": 0.91569555, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.93958408, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.6667025089263916 + }, + { + "auxiliary_loss_clip": 0.01201529, + "auxiliary_loss_mlp": 0.01191149, + "balance_loss_clip": 1.00925446, + "balance_loss_mlp": 1.00346553, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1763795901985143, + "language_loss": 0.7236281, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74755496, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.1333625316619873 + }, + { + "auxiliary_loss_clip": 0.01198587, + "auxiliary_loss_mlp": 0.01190334, + "balance_loss_clip": 1.00585091, + "balance_loss_mlp": 1.00150621, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.14952702368209, + "language_loss": 0.84573156, + "learning_rate": 3.283560135133457e-06, + "loss": 0.8696208, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.6158103942871094 + }, + { + "auxiliary_loss_clip": 0.01198361, + "auxiliary_loss_mlp": 0.01189879, + "balance_loss_clip": 1.00561666, + "balance_loss_mlp": 1.00124252, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.0538647223842332, + "language_loss": 0.8910253, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91490769, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.6782002449035645 + }, + { + "auxiliary_loss_clip": 0.01198401, + "auxiliary_loss_mlp": 0.01190364, + "balance_loss_clip": 1.00565147, + "balance_loss_mlp": 1.0017271, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.795717661507609, + "language_loss": 0.79983723, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82372487, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.696709156036377 + }, + { + "auxiliary_loss_clip": 0.01198352, + "auxiliary_loss_mlp": 0.01190454, + "balance_loss_clip": 1.00559533, + "balance_loss_mlp": 1.00162613, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 3.0659850984831034, + "language_loss": 0.91708744, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94097543, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.7296512126922607 + }, + { + "auxiliary_loss_clip": 0.01198366, + "auxiliary_loss_mlp": 0.01190177, + "balance_loss_clip": 1.00560868, + "balance_loss_mlp": 1.00173068, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.3090794774459815, + "language_loss": 0.90514934, + "learning_rate": 3.299075396334735e-06, + "loss": 0.92903483, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 4.093288421630859 + }, + { + "auxiliary_loss_clip": 0.0119832, + "auxiliary_loss_mlp": 0.0118976, + "balance_loss_clip": 1.00559139, + "balance_loss_mlp": 1.00121832, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.9121545603409464, + "language_loss": 0.8706888, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89456952, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 5.539374828338623 + }, + { + "auxiliary_loss_clip": 0.01198187, + "auxiliary_loss_mlp": 0.01189928, + "balance_loss_clip": 1.00552928, + "balance_loss_mlp": 1.00119615, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 2.0997143808410126, + "language_loss": 0.84514391, + "learning_rate": 3.306695037731344e-06, + "loss": 0.86902505, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.700817346572876 + }, + { + "auxiliary_loss_clip": 0.01198316, + "auxiliary_loss_mlp": 0.01190521, + "balance_loss_clip": 1.00551295, + "balance_loss_mlp": 1.00197911, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 1.906289250262873, + "language_loss": 0.89781672, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92170507, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.7148685455322266 + }, + { + "auxiliary_loss_clip": 0.01198519, + "auxiliary_loss_mlp": 0.01190146, + "balance_loss_clip": 1.00569272, + "balance_loss_mlp": 1.00131822, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1.8979181591742158, + "language_loss": 0.8893429, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91322947, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.6664233207702637 + }, + { + "auxiliary_loss_clip": 0.01198147, + "auxiliary_loss_mlp": 0.0118995, + "balance_loss_clip": 1.00552559, + "balance_loss_mlp": 1.00150394, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.807649892204101, + "language_loss": 0.81024027, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83412123, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.6867825984954834 + }, + { + "auxiliary_loss_clip": 0.01198436, + "auxiliary_loss_mlp": 0.01189938, + "balance_loss_clip": 1.00570703, + "balance_loss_mlp": 1.00130129, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.762907275207099, + "language_loss": 0.8237524, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.8476361, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 2.663534164428711 + }, + { + "auxiliary_loss_clip": 0.01198371, + "auxiliary_loss_mlp": 0.01190143, + "balance_loss_clip": 1.00562966, + "balance_loss_mlp": 1.0016017, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.966836229155674, + "language_loss": 0.72632444, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75020963, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 2.672105073928833 + }, + { + "auxiliary_loss_clip": 0.01198289, + "auxiliary_loss_mlp": 0.01190107, + "balance_loss_clip": 1.00564337, + "balance_loss_mlp": 1.0016613, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 3.1186703345318665, + "language_loss": 0.98014528, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00402927, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.6062402725219727 + }, + { + "auxiliary_loss_clip": 0.0119823, + "auxiliary_loss_mlp": 0.01190082, + "balance_loss_clip": 1.00555611, + "balance_loss_mlp": 1.00173151, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.2483104646991636, + "language_loss": 0.76766884, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79155201, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.6136744022369385 + }, + { + "auxiliary_loss_clip": 0.01198319, + "auxiliary_loss_mlp": 0.01189764, + "balance_loss_clip": 1.00557828, + "balance_loss_mlp": 1.001127, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.896959744067055, + "language_loss": 0.76748353, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79136431, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 2.609438180923462 + }, + { + "auxiliary_loss_clip": 0.01198376, + "auxiliary_loss_mlp": 0.01189525, + "balance_loss_clip": 1.00569487, + "balance_loss_mlp": 1.00117469, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.442998167749942, + "language_loss": 0.84087408, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86475313, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.615143060684204 + }, + { + "auxiliary_loss_clip": 0.01198057, + "auxiliary_loss_mlp": 0.01190015, + "balance_loss_clip": 1.00538158, + "balance_loss_mlp": 1.00137782, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.0931605756157188, + "language_loss": 0.83772111, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86160189, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.724703550338745 + }, + { + "auxiliary_loss_clip": 0.0119809, + "auxiliary_loss_mlp": 0.01189926, + "balance_loss_clip": 1.00546336, + "balance_loss_mlp": 1.00138497, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 2.3624305638781844, + "language_loss": 0.77528024, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.79916036, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 2.6676957607269287 + }, + { + "auxiliary_loss_clip": 0.01198228, + "auxiliary_loss_mlp": 0.01190539, + "balance_loss_clip": 1.00540209, + "balance_loss_mlp": 1.00199723, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 2.698627133322591, + "language_loss": 0.76213026, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78601795, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.735076665878296 + }, + { + "auxiliary_loss_clip": 0.01198171, + "auxiliary_loss_mlp": 0.01190058, + "balance_loss_clip": 1.00548148, + "balance_loss_mlp": 1.00170767, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.471463330683513, + "language_loss": 0.87393343, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.8978157, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.623208999633789 + }, + { + "auxiliary_loss_clip": 0.01198276, + "auxiliary_loss_mlp": 0.01189802, + "balance_loss_clip": 1.00558114, + "balance_loss_mlp": 1.00135565, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.754092732694803, + "language_loss": 0.86584145, + "learning_rate": 3.357647774369736e-06, + "loss": 0.88972229, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.6404924392700195 + }, + { + "auxiliary_loss_clip": 0.01198102, + "auxiliary_loss_mlp": 0.01189848, + "balance_loss_clip": 1.00544214, + "balance_loss_mlp": 1.00149679, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.767965621250593, + "language_loss": 0.83623916, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86011869, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.6714179515838623 + }, + { + "auxiliary_loss_clip": 0.01198057, + "auxiliary_loss_mlp": 0.01189931, + "balance_loss_clip": 1.00534964, + "balance_loss_mlp": 1.00129414, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.38026199032947, + "language_loss": 0.71159697, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73547685, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.6530380249023438 + }, + { + "auxiliary_loss_clip": 0.01198128, + "auxiliary_loss_mlp": 0.0118975, + "balance_loss_clip": 1.00545239, + "balance_loss_mlp": 1.00120914, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 4.126379399950724, + "language_loss": 1.01981854, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04369736, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.5721492767333984 + }, + { + "auxiliary_loss_clip": 0.0119796, + "auxiliary_loss_mlp": 0.01189586, + "balance_loss_clip": 1.00536883, + "balance_loss_mlp": 1.00123572, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.7396413051853936, + "language_loss": 0.75081444, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77468991, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.8054957389831543 + }, + { + "auxiliary_loss_clip": 0.01201199, + "auxiliary_loss_mlp": 0.0119014, + "balance_loss_clip": 1.00915051, + "balance_loss_mlp": 1.00245667, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7559583218227123, + "language_loss": 0.56228459, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58619797, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.2270450592041016 + }, + { + "auxiliary_loss_clip": 0.01197812, + "auxiliary_loss_mlp": 0.01190028, + "balance_loss_clip": 1.00513065, + "balance_loss_mlp": 1.00167704, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.5541447816904053, + "language_loss": 0.94608021, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.96995854, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.688849687576294 + }, + { + "auxiliary_loss_clip": 0.01197872, + "auxiliary_loss_mlp": 0.01189481, + "balance_loss_clip": 1.00523543, + "balance_loss_mlp": 1.00122547, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.849490939266682, + "language_loss": 0.84495056, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.86882412, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.606414318084717 + }, + { + "auxiliary_loss_clip": 0.01197777, + "auxiliary_loss_mlp": 0.01190348, + "balance_loss_clip": 1.00515401, + "balance_loss_mlp": 1.00199759, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 1.8547209945168457, + "language_loss": 0.91549593, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93937719, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.6706087589263916 + }, + { + "auxiliary_loss_clip": 0.01197688, + "auxiliary_loss_mlp": 0.011896, + "balance_loss_clip": 1.00505626, + "balance_loss_mlp": 1.00124943, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.060179885597712, + "language_loss": 0.86850679, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89237964, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.6860241889953613 + }, + { + "auxiliary_loss_clip": 0.0119784, + "auxiliary_loss_mlp": 0.01189576, + "balance_loss_clip": 1.00523508, + "balance_loss_mlp": 1.00122523, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 3.6380369076617907, + "language_loss": 0.92374909, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94762319, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.687612295150757 + }, + { + "auxiliary_loss_clip": 0.01197607, + "auxiliary_loss_mlp": 0.01189676, + "balance_loss_clip": 1.00504637, + "balance_loss_mlp": 1.00123012, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.348961642975206, + "language_loss": 0.89699268, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92086554, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.5827648639678955 + }, + { + "auxiliary_loss_clip": 0.01197675, + "auxiliary_loss_mlp": 0.0119024, + "balance_loss_clip": 1.005054, + "balance_loss_mlp": 1.00179362, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 2.5808527295196364, + "language_loss": 0.85962701, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88350618, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.5943241119384766 + }, + { + "auxiliary_loss_clip": 0.0119747, + "auxiliary_loss_mlp": 0.01189783, + "balance_loss_clip": 1.00488234, + "balance_loss_mlp": 1.00143266, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.561121790041778, + "language_loss": 0.93115336, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95502585, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.6290993690490723 + }, + { + "auxiliary_loss_clip": 0.01197742, + "auxiliary_loss_mlp": 0.01189796, + "balance_loss_clip": 1.00514388, + "balance_loss_mlp": 1.00144553, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.145314971406138, + "language_loss": 0.78904444, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81291986, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.7105000019073486 + }, + { + "auxiliary_loss_clip": 0.01197682, + "auxiliary_loss_mlp": 0.01189591, + "balance_loss_clip": 1.00513351, + "balance_loss_mlp": 1.00162172, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.8564082716374124, + "language_loss": 0.8833403, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90721303, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.6043925285339355 + }, + { + "auxiliary_loss_clip": 0.01197545, + "auxiliary_loss_mlp": 0.01189885, + "balance_loss_clip": 1.0049448, + "balance_loss_mlp": 1.0015341, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.9137005503830018, + "language_loss": 0.81217206, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83604634, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.6633834838867188 + }, + { + "auxiliary_loss_clip": 0.01197585, + "auxiliary_loss_mlp": 0.01189706, + "balance_loss_clip": 1.00498521, + "balance_loss_mlp": 1.00126028, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.0165931435509092, + "language_loss": 0.87979591, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90366876, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.575817346572876 + }, + { + "auxiliary_loss_clip": 0.01197448, + "auxiliary_loss_mlp": 0.01189594, + "balance_loss_clip": 1.00497532, + "balance_loss_mlp": 1.00143373, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.697475219855711, + "language_loss": 0.84067023, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.8645407, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.7061312198638916 + }, + { + "auxiliary_loss_clip": 0.01197295, + "auxiliary_loss_mlp": 0.01189714, + "balance_loss_clip": 1.00483811, + "balance_loss_mlp": 1.00145841, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9579100296639158, + "language_loss": 0.89958209, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92345214, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.6024253368377686 + }, + { + "auxiliary_loss_clip": 0.01200331, + "auxiliary_loss_mlp": 0.01190241, + "balance_loss_clip": 1.00846565, + "balance_loss_mlp": 1.00255799, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0154522859324178, + "language_loss": 0.61264563, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63655132, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.0984416007995605 + }, + { + "auxiliary_loss_clip": 0.01197438, + "auxiliary_loss_mlp": 0.01189855, + "balance_loss_clip": 1.00492668, + "balance_loss_mlp": 1.0015049, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.1648234950785734, + "language_loss": 0.91249788, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93637085, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.642822265625 + }, + { + "auxiliary_loss_clip": 0.01197459, + "auxiliary_loss_mlp": 0.01189942, + "balance_loss_clip": 1.00495982, + "balance_loss_mlp": 1.00168693, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.102424719439338, + "language_loss": 0.89250934, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91638339, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 2.706052780151367 + }, + { + "auxiliary_loss_clip": 0.01197455, + "auxiliary_loss_mlp": 0.01189679, + "balance_loss_clip": 1.00489271, + "balance_loss_mlp": 1.00132895, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.7212725504413022, + "language_loss": 0.95759249, + "learning_rate": 3.43348263905683e-06, + "loss": 0.98146379, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 4.156743288040161 + }, + { + "auxiliary_loss_clip": 0.01197277, + "auxiliary_loss_mlp": 0.01190028, + "balance_loss_clip": 1.00472176, + "balance_loss_mlp": 1.00167751, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.0603781116594115, + "language_loss": 0.75958872, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78346175, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 4.2019288539886475 + }, + { + "auxiliary_loss_clip": 0.01197203, + "auxiliary_loss_mlp": 0.01189401, + "balance_loss_clip": 1.00477815, + "balance_loss_mlp": 1.00133657, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.591411718589031, + "language_loss": 0.98636693, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01023293, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.6556944847106934 + }, + { + "auxiliary_loss_clip": 0.01197282, + "auxiliary_loss_mlp": 0.01189718, + "balance_loss_clip": 1.00484252, + "balance_loss_mlp": 1.0015583, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 3.468130561031844, + "language_loss": 0.8528983, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87676835, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.7751216888427734 + }, + { + "auxiliary_loss_clip": 0.01197171, + "auxiliary_loss_mlp": 0.01189781, + "balance_loss_clip": 1.00472307, + "balance_loss_mlp": 1.00162137, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.181169328249913, + "language_loss": 0.97075677, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99462634, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.634891986846924 + }, + { + "auxiliary_loss_clip": 0.01197349, + "auxiliary_loss_mlp": 0.01189709, + "balance_loss_clip": 1.00489259, + "balance_loss_mlp": 1.00164473, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 2.2462249729313783, + "language_loss": 0.95162439, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97549498, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.5974619388580322 + }, + { + "auxiliary_loss_clip": 0.01197093, + "auxiliary_loss_mlp": 0.01189467, + "balance_loss_clip": 1.00467539, + "balance_loss_mlp": 1.00159311, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.9561646758499407, + "language_loss": 0.76150423, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78536981, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 2.8059518337249756 + }, + { + "auxiliary_loss_clip": 0.0119714, + "auxiliary_loss_mlp": 0.01189703, + "balance_loss_clip": 1.00465822, + "balance_loss_mlp": 1.00154269, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 2.316409248591858, + "language_loss": 0.86454457, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.88841295, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.597541570663452 + }, + { + "auxiliary_loss_clip": 0.01196998, + "auxiliary_loss_mlp": 0.0118972, + "balance_loss_clip": 1.00462639, + "balance_loss_mlp": 1.00184631, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3417702019874342, + "language_loss": 0.77433729, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79820454, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.6373040676116943 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01189671, + "balance_loss_clip": 1.00471985, + "balance_loss_mlp": 1.0016067, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.433535125301677, + "language_loss": 0.90233815, + "learning_rate": 3.460884739729461e-06, + "loss": 0.92620599, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.6656906604766846 + }, + { + "auxiliary_loss_clip": 0.01197012, + "auxiliary_loss_mlp": 0.01189584, + "balance_loss_clip": 1.00458384, + "balance_loss_mlp": 1.00142431, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.4254964138730406, + "language_loss": 0.9345957, + "learning_rate": 3.463858658104523e-06, + "loss": 0.95846176, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.6524658203125 + }, + { + "auxiliary_loss_clip": 0.0119707, + "auxiliary_loss_mlp": 0.01189126, + "balance_loss_clip": 1.00467157, + "balance_loss_mlp": 1.00115693, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9068951908459544, + "language_loss": 0.93492019, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.95878208, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.6148242950439453 + }, + { + "auxiliary_loss_clip": 0.01196992, + "auxiliary_loss_mlp": 0.01189437, + "balance_loss_clip": 1.00464821, + "balance_loss_mlp": 1.00156319, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 2.747044631489437, + "language_loss": 0.86090922, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88477355, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.628662347793579 + }, + { + "auxiliary_loss_clip": 0.01196846, + "auxiliary_loss_mlp": 0.01189268, + "balance_loss_clip": 1.00448096, + "balance_loss_mlp": 1.00120366, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 2.1034572109364604, + "language_loss": 0.87421554, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.89807665, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.696949005126953 + }, + { + "auxiliary_loss_clip": 0.01196906, + "auxiliary_loss_mlp": 0.01189526, + "balance_loss_clip": 1.00450921, + "balance_loss_mlp": 1.00174797, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 2.401755746557476, + "language_loss": 0.86488867, + "learning_rate": 3.475618842282164e-06, + "loss": 0.888753, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 2.5877034664154053 + }, + { + "auxiliary_loss_clip": 0.01196865, + "auxiliary_loss_mlp": 0.01189527, + "balance_loss_clip": 1.00447512, + "balance_loss_mlp": 1.00165296, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.067562872643274, + "language_loss": 0.92257464, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94643855, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.5865538120269775 + }, + { + "auxiliary_loss_clip": 0.01196865, + "auxiliary_loss_mlp": 0.01189483, + "balance_loss_clip": 1.00458717, + "balance_loss_mlp": 1.00141859, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.3737670969591966, + "language_loss": 0.95597899, + "learning_rate": 3.481419351635897e-06, + "loss": 0.97984248, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.584205150604248 + }, + { + "auxiliary_loss_clip": 0.01196834, + "auxiliary_loss_mlp": 0.01189577, + "balance_loss_clip": 1.00454807, + "balance_loss_mlp": 1.00151229, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5795570267434087, + "language_loss": 0.8857528, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90961695, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.574643611907959 + }, + { + "auxiliary_loss_clip": 0.01196721, + "auxiliary_loss_mlp": 0.01189184, + "balance_loss_clip": 1.00438261, + "balance_loss_mlp": 1.00140524, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 10.573369995937405, + "language_loss": 0.89197451, + "learning_rate": 3.487168070036317e-06, + "loss": 0.91583353, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.583505392074585 + }, + { + "auxiliary_loss_clip": 0.01196659, + "auxiliary_loss_mlp": 0.0118959, + "balance_loss_clip": 1.00436473, + "balance_loss_mlp": 1.00152564, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 1.9621283212650134, + "language_loss": 0.98948479, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01334739, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.6078126430511475 + }, + { + "auxiliary_loss_clip": 0.0119688, + "auxiliary_loss_mlp": 0.01189611, + "balance_loss_clip": 1.0045135, + "balance_loss_mlp": 1.00154614, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 3.332101895097958, + "language_loss": 0.9103117, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93417656, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.61423659324646 + }, + { + "auxiliary_loss_clip": 0.01199725, + "auxiliary_loss_mlp": 0.01188509, + "balance_loss_clip": 1.00824118, + "balance_loss_mlp": 1.0015893, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9538490339361808, + "language_loss": 0.57668674, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.60056907, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.2443625926971436 + }, + { + "auxiliary_loss_clip": 0.01196459, + "auxiliary_loss_mlp": 0.01189041, + "balance_loss_clip": 1.00420761, + "balance_loss_mlp": 1.0012629, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.7493879500372818, + "language_loss": 0.87750059, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90135562, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.612797975540161 + }, + { + "auxiliary_loss_clip": 0.01196747, + "auxiliary_loss_mlp": 0.01189518, + "balance_loss_clip": 1.00439107, + "balance_loss_mlp": 1.00145328, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 3.53435477846783, + "language_loss": 0.84121656, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86507928, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.6221933364868164 + }, + { + "auxiliary_loss_clip": 0.01196766, + "auxiliary_loss_mlp": 0.01189565, + "balance_loss_clip": 1.00440598, + "balance_loss_mlp": 1.0016917, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 2.254078744575713, + "language_loss": 0.90480918, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92867255, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.599743366241455 + }, + { + "auxiliary_loss_clip": 0.01196712, + "auxiliary_loss_mlp": 0.01189265, + "balance_loss_clip": 1.00445843, + "balance_loss_mlp": 1.00129604, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.1594005689640943, + "language_loss": 0.83621275, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86007261, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.6328909397125244 + }, + { + "auxiliary_loss_clip": 0.01196579, + "auxiliary_loss_mlp": 0.01189264, + "balance_loss_clip": 1.00424027, + "balance_loss_mlp": 1.00119901, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.999489350761657, + "language_loss": 0.74054801, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76440638, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.5842058658599854 + }, + { + "auxiliary_loss_clip": 0.01196839, + "auxiliary_loss_mlp": 0.01189588, + "balance_loss_clip": 1.00455832, + "balance_loss_mlp": 1.0016191, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.3201711835021865, + "language_loss": 0.85383916, + "learning_rate": 3.512420411838642e-06, + "loss": 0.87770343, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.586994171142578 + }, + { + "auxiliary_loss_clip": 0.01196555, + "auxiliary_loss_mlp": 0.01189225, + "balance_loss_clip": 1.00429583, + "balance_loss_mlp": 1.00144613, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.880523363998594, + "language_loss": 0.89113188, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91498965, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.55352783203125 + }, + { + "auxiliary_loss_clip": 0.01196542, + "auxiliary_loss_mlp": 0.01189509, + "balance_loss_clip": 1.00431514, + "balance_loss_mlp": 1.00144446, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 3.2571281082247254, + "language_loss": 0.85520935, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.87906986, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.637554407119751 + }, + { + "auxiliary_loss_clip": 0.0119641, + "auxiliary_loss_mlp": 0.01189038, + "balance_loss_clip": 1.0041616, + "balance_loss_mlp": 1.00125992, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.058813950219459, + "language_loss": 0.8244766, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84833103, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.7099609375 + }, + { + "auxiliary_loss_clip": 0.01196567, + "auxiliary_loss_mlp": 0.01189732, + "balance_loss_clip": 1.00437641, + "balance_loss_mlp": 1.00166774, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 2.1245851768833632, + "language_loss": 0.77401447, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79787749, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.566415309906006 + }, + { + "auxiliary_loss_clip": 0.01196606, + "auxiliary_loss_mlp": 0.01189456, + "balance_loss_clip": 1.00438917, + "balance_loss_mlp": 1.00167727, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.8281357923626183, + "language_loss": 0.87085009, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89471072, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.5925590991973877 + }, + { + "auxiliary_loss_clip": 0.01196527, + "auxiliary_loss_mlp": 0.01188945, + "balance_loss_clip": 1.00426066, + "balance_loss_mlp": 1.00126231, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.1749401148389422, + "language_loss": 0.93141657, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95527124, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.6669843196868896 + }, + { + "auxiliary_loss_clip": 0.01196406, + "auxiliary_loss_mlp": 0.01189225, + "balance_loss_clip": 1.00419462, + "balance_loss_mlp": 1.00154173, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 3.426103889661656, + "language_loss": 0.8466984, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87055469, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.6646902561187744 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01189179, + "balance_loss_clip": 1.00424719, + "balance_loss_mlp": 1.00159156, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 2.058609606987996, + "language_loss": 0.8848272, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90868276, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.6137523651123047 + }, + { + "auxiliary_loss_clip": 0.01196373, + "auxiliary_loss_mlp": 0.01189216, + "balance_loss_clip": 1.00412416, + "balance_loss_mlp": 1.00134182, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 5.051360066709413, + "language_loss": 0.86883706, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89269292, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 2.576808214187622 + }, + { + "auxiliary_loss_clip": 0.0119637, + "auxiliary_loss_mlp": 0.01189424, + "balance_loss_clip": 1.00421357, + "balance_loss_mlp": 1.00164604, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.8075452742081761, + "language_loss": 0.84168983, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86554778, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 2.6170716285705566 + }, + { + "auxiliary_loss_clip": 0.01196511, + "auxiliary_loss_mlp": 0.01189344, + "balance_loss_clip": 1.00424123, + "balance_loss_mlp": 1.00147057, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.7689661125557135, + "language_loss": 0.78806072, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81191921, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 5.575216770172119 + }, + { + "auxiliary_loss_clip": 0.01196362, + "auxiliary_loss_mlp": 0.01189077, + "balance_loss_clip": 1.00417709, + "balance_loss_mlp": 1.00139344, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.5624180424629506, + "language_loss": 0.84111315, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86496758, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 5.518522500991821 + }, + { + "auxiliary_loss_clip": 0.0119631, + "auxiliary_loss_mlp": 0.01189061, + "balance_loss_clip": 1.00412512, + "balance_loss_mlp": 1.00137782, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9854106261296955, + "language_loss": 0.90040004, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92425376, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.584886074066162 + }, + { + "auxiliary_loss_clip": 0.01196299, + "auxiliary_loss_mlp": 0.01189028, + "balance_loss_clip": 1.00408578, + "balance_loss_mlp": 1.00124979, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.151576647482021, + "language_loss": 0.78147727, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80533057, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.733421802520752 + }, + { + "auxiliary_loss_clip": 0.01196364, + "auxiliary_loss_mlp": 0.01189197, + "balance_loss_clip": 1.00417256, + "balance_loss_mlp": 1.00151408, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.458731892753671, + "language_loss": 0.83727783, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86113346, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.6504974365234375 + }, + { + "auxiliary_loss_clip": 0.0119621, + "auxiliary_loss_mlp": 0.0118909, + "balance_loss_clip": 1.004004, + "balance_loss_mlp": 1.00140738, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 1.9680816824392147, + "language_loss": 0.93394899, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.957802, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.6379153728485107 + }, + { + "auxiliary_loss_clip": 0.01196485, + "auxiliary_loss_mlp": 0.01189723, + "balance_loss_clip": 1.00425816, + "balance_loss_mlp": 1.00175345, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.3874419913350153, + "language_loss": 0.96718568, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99104768, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.6012063026428223 + }, + { + "auxiliary_loss_clip": 0.01196324, + "auxiliary_loss_mlp": 0.01189195, + "balance_loss_clip": 1.00417984, + "balance_loss_mlp": 1.00141621, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.228698382134234, + "language_loss": 0.84118509, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.8650403, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.6464290618896484 + }, + { + "auxiliary_loss_clip": 0.01196174, + "auxiliary_loss_mlp": 0.01189253, + "balance_loss_clip": 1.00408268, + "balance_loss_mlp": 1.00147438, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.261825244198546, + "language_loss": 0.98202634, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00588059, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.5677711963653564 + }, + { + "auxiliary_loss_clip": 0.01199131, + "auxiliary_loss_mlp": 0.01188601, + "balance_loss_clip": 1.00781679, + "balance_loss_mlp": 1.00168121, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8469953720459168, + "language_loss": 0.55625218, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.5801295, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.1368086338043213 + }, + { + "auxiliary_loss_clip": 0.01196214, + "auxiliary_loss_mlp": 0.01189516, + "balance_loss_clip": 1.00398469, + "balance_loss_mlp": 1.00164258, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.050384139013817, + "language_loss": 0.90024018, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92409754, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.6510586738586426 + }, + { + "auxiliary_loss_clip": 0.01196238, + "auxiliary_loss_mlp": 0.01189856, + "balance_loss_clip": 1.00403595, + "balance_loss_mlp": 1.00207782, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.3115248806887028, + "language_loss": 0.85664457, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8805055, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.636277437210083 + }, + { + "auxiliary_loss_clip": 0.0119619, + "auxiliary_loss_mlp": 0.01189262, + "balance_loss_clip": 1.00397873, + "balance_loss_mlp": 1.00157857, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 5.227535441729806, + "language_loss": 0.71289527, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73674977, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.555382490158081 + }, + { + "auxiliary_loss_clip": 0.01196154, + "auxiliary_loss_mlp": 0.01189001, + "balance_loss_clip": 1.00405383, + "balance_loss_mlp": 1.00150824, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.5736286883526787, + "language_loss": 0.94838643, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97223806, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.614017963409424 + }, + { + "auxiliary_loss_clip": 0.01196013, + "auxiliary_loss_mlp": 0.01188915, + "balance_loss_clip": 1.00390959, + "balance_loss_mlp": 1.00132751, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.829753977410553, + "language_loss": 0.92864573, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95249504, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.5737829208374023 + }, + { + "auxiliary_loss_clip": 0.0119595, + "auxiliary_loss_mlp": 0.01188875, + "balance_loss_clip": 1.00395107, + "balance_loss_mlp": 1.00128698, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.0650660928941136, + "language_loss": 0.97409713, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99794543, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.601501703262329 + }, + { + "auxiliary_loss_clip": 0.01196105, + "auxiliary_loss_mlp": 0.01189253, + "balance_loss_clip": 1.00400448, + "balance_loss_mlp": 1.00166559, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 14.174520521700918, + "language_loss": 0.87600207, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.89985561, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.641620397567749 + }, + { + "auxiliary_loss_clip": 0.01196172, + "auxiliary_loss_mlp": 0.01189067, + "balance_loss_clip": 1.00402784, + "balance_loss_mlp": 1.00157428, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 2.0201328901143314, + "language_loss": 0.67260492, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69645733, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.6124563217163086 + }, + { + "auxiliary_loss_clip": 0.01195973, + "auxiliary_loss_mlp": 0.0118921, + "balance_loss_clip": 1.00383759, + "balance_loss_mlp": 1.00181341, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 3.5277136581114217, + "language_loss": 0.6820966, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70594847, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.5632994174957275 + }, + { + "auxiliary_loss_clip": 0.0119598, + "auxiliary_loss_mlp": 0.01188958, + "balance_loss_clip": 1.00387132, + "balance_loss_mlp": 1.00146532, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.057270621812374, + "language_loss": 0.85140413, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87525344, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.5823919773101807 + }, + { + "auxiliary_loss_clip": 0.01196062, + "auxiliary_loss_mlp": 0.0118883, + "balance_loss_clip": 1.0039494, + "balance_loss_mlp": 1.0012424, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.73872428656617, + "language_loss": 1.0415709, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06541967, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.5532472133636475 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01189349, + "balance_loss_clip": 1.00406063, + "balance_loss_mlp": 1.00166583, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 5.606460729913339, + "language_loss": 0.75162256, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77547801, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.608656406402588 + }, + { + "auxiliary_loss_clip": 0.01195956, + "auxiliary_loss_mlp": 0.01188991, + "balance_loss_clip": 1.00383472, + "balance_loss_mlp": 1.00149846, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.8762503063741716, + "language_loss": 0.90500814, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.92885762, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.579796314239502 + }, + { + "auxiliary_loss_clip": 0.01195982, + "auxiliary_loss_mlp": 0.01189217, + "balance_loss_clip": 1.00397336, + "balance_loss_mlp": 1.00162923, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.6510008203472197, + "language_loss": 0.85694504, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88079703, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.7093358039855957 + }, + { + "auxiliary_loss_clip": 0.01196025, + "auxiliary_loss_mlp": 0.01188717, + "balance_loss_clip": 1.00404465, + "balance_loss_mlp": 1.00122499, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 2.1823689345240505, + "language_loss": 0.88392162, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90776902, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.6462905406951904 + }, + { + "auxiliary_loss_clip": 0.0119594, + "auxiliary_loss_mlp": 0.01189034, + "balance_loss_clip": 1.0039053, + "balance_loss_mlp": 1.00125527, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.5596375567053884, + "language_loss": 0.97109151, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99494123, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.5781362056732178 + }, + { + "auxiliary_loss_clip": 0.0119591, + "auxiliary_loss_mlp": 0.01188815, + "balance_loss_clip": 1.00400996, + "balance_loss_mlp": 1.00160849, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1603915847106676, + "language_loss": 0.86023933, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88408661, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.6612563133239746 + }, + { + "auxiliary_loss_clip": 0.01195946, + "auxiliary_loss_mlp": 0.01188961, + "balance_loss_clip": 1.00394166, + "balance_loss_mlp": 1.00137329, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 6.248247443649876, + "language_loss": 0.81340694, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83725595, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.645195960998535 + }, + { + "auxiliary_loss_clip": 0.01195777, + "auxiliary_loss_mlp": 0.01189287, + "balance_loss_clip": 1.00382137, + "balance_loss_mlp": 1.00179505, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.3287781716713356, + "language_loss": 0.81202984, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83588052, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.5727226734161377 + }, + { + "auxiliary_loss_clip": 0.01195928, + "auxiliary_loss_mlp": 0.01188814, + "balance_loss_clip": 1.00387239, + "balance_loss_mlp": 1.00122643, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.446279241896532, + "language_loss": 0.91424513, + "learning_rate": 3.614024787585744e-06, + "loss": 0.93809259, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.565382242202759 + }, + { + "auxiliary_loss_clip": 0.0119579, + "auxiliary_loss_mlp": 0.01189115, + "balance_loss_clip": 1.00377309, + "balance_loss_mlp": 1.0017184, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 2.0083857266985445, + "language_loss": 0.88079476, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90464377, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.603703022003174 + }, + { + "auxiliary_loss_clip": 0.01195738, + "auxiliary_loss_mlp": 0.01188843, + "balance_loss_clip": 1.00375462, + "balance_loss_mlp": 1.00135064, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.530661681031416, + "language_loss": 0.8031882, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.827034, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.6034293174743652 + }, + { + "auxiliary_loss_clip": 0.01195875, + "auxiliary_loss_mlp": 0.01189011, + "balance_loss_clip": 1.00393701, + "balance_loss_mlp": 1.00132823, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.869531541684868, + "language_loss": 0.81066048, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83450937, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.6793015003204346 + }, + { + "auxiliary_loss_clip": 0.0119552, + "auxiliary_loss_mlp": 0.01188639, + "balance_loss_clip": 1.00361085, + "balance_loss_mlp": 1.00114667, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.154722425788673, + "language_loss": 0.80320668, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82704824, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.521042823791504 + }, + { + "auxiliary_loss_clip": 0.01195781, + "auxiliary_loss_mlp": 0.01188669, + "balance_loss_clip": 1.00381851, + "balance_loss_mlp": 1.00127244, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.9151442525434552, + "language_loss": 0.90732956, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.9311741, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.618645429611206 + }, + { + "auxiliary_loss_clip": 0.0119569, + "auxiliary_loss_mlp": 0.01188983, + "balance_loss_clip": 1.0037545, + "balance_loss_mlp": 1.00168157, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.1239102800754526, + "language_loss": 0.94170201, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96554875, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.61035418510437 + }, + { + "auxiliary_loss_clip": 0.01195596, + "auxiliary_loss_mlp": 0.01188889, + "balance_loss_clip": 1.00366902, + "balance_loss_mlp": 1.00168312, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.0903714215144564, + "language_loss": 0.74147832, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76532316, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.638803482055664 + }, + { + "auxiliary_loss_clip": 0.01195815, + "auxiliary_loss_mlp": 0.01188873, + "balance_loss_clip": 1.0038842, + "balance_loss_mlp": 1.00147569, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.401064352388216, + "language_loss": 0.79976535, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82361221, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.546952962875366 + }, + { + "auxiliary_loss_clip": 0.01195703, + "auxiliary_loss_mlp": 0.01189513, + "balance_loss_clip": 1.00374031, + "balance_loss_mlp": 1.00183034, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 1.895186600728398, + "language_loss": 0.77346957, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.79732174, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 3.974947214126587 + }, + { + "auxiliary_loss_clip": 0.01195926, + "auxiliary_loss_mlp": 0.01188438, + "balance_loss_clip": 1.0040257, + "balance_loss_mlp": 1.00123167, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.186735121609281, + "language_loss": 0.84141731, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86526096, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 5.477049112319946 + }, + { + "auxiliary_loss_clip": 0.01195634, + "auxiliary_loss_mlp": 0.01188691, + "balance_loss_clip": 1.00375104, + "balance_loss_mlp": 1.0013895, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.8870692495352532, + "language_loss": 0.96906936, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99291265, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.594989061355591 + }, + { + "auxiliary_loss_clip": 0.01195769, + "auxiliary_loss_mlp": 0.01188924, + "balance_loss_clip": 1.00379229, + "balance_loss_mlp": 1.00124049, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 4.619112019954991, + "language_loss": 0.9365232, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96037012, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.623629570007324 + }, + { + "auxiliary_loss_clip": 0.01195627, + "auxiliary_loss_mlp": 0.01188566, + "balance_loss_clip": 1.00376534, + "balance_loss_mlp": 1.00107324, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.9439741797258183, + "language_loss": 0.92229116, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94613308, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.6309876441955566 + }, + { + "auxiliary_loss_clip": 0.01195585, + "auxiliary_loss_mlp": 0.01188472, + "balance_loss_clip": 1.00362134, + "balance_loss_mlp": 1.00117064, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2559693867570694, + "language_loss": 1.01977098, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04361153, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.5223381519317627 + }, + { + "auxiliary_loss_clip": 0.01198998, + "auxiliary_loss_mlp": 0.01188751, + "balance_loss_clip": 1.0080092, + "balance_loss_mlp": 1.00183094, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 1.3594241660625728, + "language_loss": 0.63883185, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66270936, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.249376058578491 + }, + { + "auxiliary_loss_clip": 0.01195674, + "auxiliary_loss_mlp": 0.01189159, + "balance_loss_clip": 1.00377202, + "balance_loss_mlp": 1.00176179, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4304404407347864, + "language_loss": 0.88391376, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90776211, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.5393757820129395 + }, + { + "auxiliary_loss_clip": 0.01195643, + "auxiliary_loss_mlp": 0.01188211, + "balance_loss_clip": 1.00376201, + "balance_loss_mlp": 1.00090921, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.2816134447887046, + "language_loss": 0.84896648, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.872805, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.6595816612243652 + }, + { + "auxiliary_loss_clip": 0.01195564, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_clip": 1.0038141, + "balance_loss_mlp": 1.00138068, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6692776930278728, + "language_loss": 0.72630584, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.7501474, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.61893892288208 + }, + { + "auxiliary_loss_clip": 0.01195553, + "auxiliary_loss_mlp": 0.01188857, + "balance_loss_clip": 1.00370741, + "balance_loss_mlp": 1.0014596, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 7.0624547479321995, + "language_loss": 0.87359494, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89743906, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.5748050212860107 + }, + { + "auxiliary_loss_clip": 0.01195666, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_clip": 1.00385058, + "balance_loss_mlp": 1.00140822, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 20.034359342153394, + "language_loss": 0.81081378, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83465654, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.5933449268341064 + }, + { + "auxiliary_loss_clip": 0.01195398, + "auxiliary_loss_mlp": 0.01188896, + "balance_loss_clip": 1.00355768, + "balance_loss_mlp": 1.00168943, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 1.9781417136587323, + "language_loss": 0.83865386, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86249673, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.6220922470092773 + }, + { + "auxiliary_loss_clip": 0.01195729, + "auxiliary_loss_mlp": 0.01188665, + "balance_loss_clip": 1.00386918, + "balance_loss_mlp": 1.00164962, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.112875556248863, + "language_loss": 0.84536672, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.86921072, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 2.5712130069732666 + }, + { + "auxiliary_loss_clip": 0.01195782, + "auxiliary_loss_mlp": 0.0118864, + "balance_loss_clip": 1.00385833, + "balance_loss_mlp": 1.00152946, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.6979247988913184, + "language_loss": 0.8791495, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90299374, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 2.6135213375091553 + }, + { + "auxiliary_loss_clip": 0.01195564, + "auxiliary_loss_mlp": 0.01188805, + "balance_loss_clip": 1.0036937, + "balance_loss_mlp": 1.00150371, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 3.1294667175842545, + "language_loss": 0.88789904, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.91174269, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.5547657012939453 + }, + { + "auxiliary_loss_clip": 0.01195532, + "auxiliary_loss_mlp": 0.0118895, + "balance_loss_clip": 1.00378466, + "balance_loss_mlp": 1.00193441, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 2.333133484616308, + "language_loss": 0.88892817, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91277301, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.5570220947265625 + }, + { + "auxiliary_loss_clip": 0.01195464, + "auxiliary_loss_mlp": 0.01188571, + "balance_loss_clip": 1.00366378, + "balance_loss_mlp": 1.00146019, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.3390607789257407, + "language_loss": 0.64883029, + "learning_rate": 3.672392800539357e-06, + "loss": 0.6726706, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.588435173034668 + }, + { + "auxiliary_loss_clip": 0.011955, + "auxiliary_loss_mlp": 0.01188665, + "balance_loss_clip": 1.00377977, + "balance_loss_mlp": 1.00164998, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.829269445638537, + "language_loss": 0.88278371, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90662539, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.539280891418457 + }, + { + "auxiliary_loss_clip": 0.01198783, + "auxiliary_loss_mlp": 0.01188472, + "balance_loss_clip": 1.00778008, + "balance_loss_mlp": 1.00155163, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8325467453190081, + "language_loss": 0.62214267, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64601517, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.252804756164551 + }, + { + "auxiliary_loss_clip": 0.01195289, + "auxiliary_loss_mlp": 0.01189085, + "balance_loss_clip": 1.00351954, + "balance_loss_mlp": 1.00178325, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.056140613854408, + "language_loss": 0.89784336, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92168713, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.5697288513183594 + }, + { + "auxiliary_loss_clip": 0.01195499, + "auxiliary_loss_mlp": 0.01188873, + "balance_loss_clip": 1.00369358, + "balance_loss_mlp": 1.00176263, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.9778821756159926, + "language_loss": 0.80198956, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82583332, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.6063811779022217 + }, + { + "auxiliary_loss_clip": 0.01195454, + "auxiliary_loss_mlp": 0.01188452, + "balance_loss_clip": 1.00373411, + "balance_loss_mlp": 1.00143635, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 2.2205545723769418, + "language_loss": 0.82819581, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85203493, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.586529493331909 + }, + { + "auxiliary_loss_clip": 0.01195521, + "auxiliary_loss_mlp": 0.01188429, + "balance_loss_clip": 1.00371802, + "balance_loss_mlp": 1.00131774, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9231835063220308, + "language_loss": 0.90978932, + "learning_rate": 3.685142765363119e-06, + "loss": 0.9336288, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.562018632888794 + }, + { + "auxiliary_loss_clip": 0.01195364, + "auxiliary_loss_mlp": 0.01188413, + "balance_loss_clip": 1.00357842, + "balance_loss_mlp": 1.00139785, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.2552085745954034, + "language_loss": 0.86756134, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89139915, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.6537044048309326 + }, + { + "auxiliary_loss_clip": 0.01195381, + "auxiliary_loss_mlp": 0.01188471, + "balance_loss_clip": 1.00367093, + "balance_loss_mlp": 1.00155091, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 1.974363940494778, + "language_loss": 0.71472067, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.73855913, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.575575828552246 + }, + { + "auxiliary_loss_clip": 0.01195464, + "auxiliary_loss_mlp": 0.01188652, + "balance_loss_clip": 1.00360298, + "balance_loss_mlp": 1.00144589, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.43359168245498, + "language_loss": 0.92082226, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94466347, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.5771734714508057 + }, + { + "auxiliary_loss_clip": 0.01195568, + "auxiliary_loss_mlp": 0.01188763, + "balance_loss_clip": 1.00366747, + "balance_loss_mlp": 1.00146139, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.1614778912122956, + "language_loss": 0.72566652, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74950981, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.6609530448913574 + }, + { + "auxiliary_loss_clip": 0.01195349, + "auxiliary_loss_mlp": 0.01188875, + "balance_loss_clip": 1.00359178, + "balance_loss_mlp": 1.0017643, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 1.8780252098242223, + "language_loss": 0.74115109, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76499331, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.78155255317688 + }, + { + "auxiliary_loss_clip": 0.01195433, + "auxiliary_loss_mlp": 0.01188596, + "balance_loss_clip": 1.00369883, + "balance_loss_mlp": 1.00167632, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 3.8691758078380696, + "language_loss": 0.91490495, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93874526, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.5974278450012207 + }, + { + "auxiliary_loss_clip": 0.01195386, + "auxiliary_loss_mlp": 0.01188717, + "balance_loss_clip": 1.0036025, + "balance_loss_mlp": 1.00179648, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.8776234893342836, + "language_loss": 0.89961588, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92345691, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.5584375858306885 + }, + { + "auxiliary_loss_clip": 0.01195326, + "auxiliary_loss_mlp": 0.01188722, + "balance_loss_clip": 1.00355446, + "balance_loss_mlp": 1.00161123, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 4.303422069916444, + "language_loss": 0.73188233, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75572276, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.5762665271759033 + }, + { + "auxiliary_loss_clip": 0.0119527, + "auxiliary_loss_mlp": 0.01188736, + "balance_loss_clip": 1.00353765, + "balance_loss_mlp": 1.00143456, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.708412946593191, + "language_loss": 0.89767879, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.9215188, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.6736655235290527 + }, + { + "auxiliary_loss_clip": 0.01195404, + "auxiliary_loss_mlp": 0.01188295, + "balance_loss_clip": 1.00360608, + "balance_loss_mlp": 1.00118399, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.7625240118474566, + "language_loss": 0.80305612, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.82689309, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.572908878326416 + }, + { + "auxiliary_loss_clip": 0.01195169, + "auxiliary_loss_mlp": 0.01188319, + "balance_loss_clip": 1.00350189, + "balance_loss_mlp": 1.00120783, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.2672071796767055, + "language_loss": 0.90152627, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92536116, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.5416672229766846 + }, + { + "auxiliary_loss_clip": 0.01195296, + "auxiliary_loss_mlp": 0.01188257, + "balance_loss_clip": 1.00365365, + "balance_loss_mlp": 1.00133646, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.264440424543427, + "language_loss": 0.91068876, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93452418, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.553959846496582 + }, + { + "auxiliary_loss_clip": 0.01195168, + "auxiliary_loss_mlp": 0.01188363, + "balance_loss_clip": 1.00345802, + "balance_loss_mlp": 1.0013479, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 4.109905666495919, + "language_loss": 0.93955028, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96338564, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 2.613614320755005 + }, + { + "auxiliary_loss_clip": 0.01198768, + "auxiliary_loss_mlp": 0.01188142, + "balance_loss_clip": 1.00769436, + "balance_loss_mlp": 1.00122166, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9104571850474849, + "language_loss": 0.59807879, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62194788, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 3.0464274883270264 + }, + { + "auxiliary_loss_clip": 0.01195132, + "auxiliary_loss_mlp": 0.01188673, + "balance_loss_clip": 1.00351214, + "balance_loss_mlp": 1.00175261, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 2.2918143231589463, + "language_loss": 0.9025768, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92641485, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 4.013676166534424 + }, + { + "auxiliary_loss_clip": 0.01195295, + "auxiliary_loss_mlp": 0.0118882, + "balance_loss_clip": 1.00366437, + "balance_loss_mlp": 1.00170898, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.320300382415401, + "language_loss": 0.82910109, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85294223, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 4.057531356811523 + }, + { + "auxiliary_loss_clip": 0.011953, + "auxiliary_loss_mlp": 0.01188189, + "balance_loss_clip": 1.00352287, + "balance_loss_mlp": 1.00117314, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 5.468425056502706, + "language_loss": 0.72536957, + "learning_rate": 3.719954063833981e-06, + "loss": 0.7492044, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 3.9863829612731934 + }, + { + "auxiliary_loss_clip": 0.01195254, + "auxiliary_loss_mlp": 0.01188147, + "balance_loss_clip": 1.00345945, + "balance_loss_mlp": 1.00122666, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.129466854218828, + "language_loss": 0.92396808, + "learning_rate": 3.721944334919596e-06, + "loss": 0.94780207, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.5475199222564697 + }, + { + "auxiliary_loss_clip": 0.01195306, + "auxiliary_loss_mlp": 0.0118851, + "balance_loss_clip": 1.00357151, + "balance_loss_mlp": 1.00139868, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 2.4189331273426475, + "language_loss": 0.65348065, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67731881, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.5470056533813477 + }, + { + "auxiliary_loss_clip": 0.01195237, + "auxiliary_loss_mlp": 0.01188591, + "balance_loss_clip": 1.00359631, + "balance_loss_mlp": 1.00195718, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 2.096142327119798, + "language_loss": 0.76732314, + "learning_rate": 3.72590651470665e-06, + "loss": 0.79116142, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.5564210414886475 + }, + { + "auxiliary_loss_clip": 0.0119512, + "auxiliary_loss_mlp": 0.01188688, + "balance_loss_clip": 1.0033592, + "balance_loss_mlp": 1.001863, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.2848046102759345, + "language_loss": 0.7991088, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82294691, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.5955514907836914 + }, + { + "auxiliary_loss_clip": 0.01195196, + "auxiliary_loss_mlp": 0.01188287, + "balance_loss_clip": 1.00365901, + "balance_loss_mlp": 1.00146198, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 3.440695903923667, + "language_loss": 0.8135674, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83740222, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.5630486011505127 + }, + { + "auxiliary_loss_clip": 0.01195203, + "auxiliary_loss_mlp": 0.01188323, + "balance_loss_clip": 1.00344229, + "balance_loss_mlp": 1.00130773, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.4883080555826993, + "language_loss": 0.93414974, + "learning_rate": 3.731804438545683e-06, + "loss": 0.95798498, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.5643832683563232 + }, + { + "auxiliary_loss_clip": 0.01195181, + "auxiliary_loss_mlp": 0.0118859, + "balance_loss_clip": 1.00351572, + "balance_loss_mlp": 1.00186062, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.6088325390669245, + "language_loss": 0.75288117, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.77671885, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.571936845779419 + }, + { + "auxiliary_loss_clip": 0.0119509, + "auxiliary_loss_mlp": 0.01188661, + "balance_loss_clip": 1.00349712, + "balance_loss_mlp": 1.00174093, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.505305073727932, + "language_loss": 0.93551284, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.95935035, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.526503801345825 + }, + { + "auxiliary_loss_clip": 0.01195122, + "auxiliary_loss_mlp": 0.01188042, + "balance_loss_clip": 1.00358057, + "balance_loss_mlp": 1.0011214, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.1765051745765494, + "language_loss": 0.92704606, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95087773, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.5513601303100586 + }, + { + "auxiliary_loss_clip": 0.01195072, + "auxiliary_loss_mlp": 0.01188048, + "balance_loss_clip": 1.00343251, + "balance_loss_mlp": 1.00122309, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 3.2208798377434773, + "language_loss": 0.7537902, + "learning_rate": 3.739585224276384e-06, + "loss": 0.77762139, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.559389591217041 + }, + { + "auxiliary_loss_clip": 0.01195228, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_clip": 1.00359058, + "balance_loss_mlp": 1.0012784, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.4666783539174104, + "language_loss": 0.78655684, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81039017, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.6480915546417236 + }, + { + "auxiliary_loss_clip": 0.01195027, + "auxiliary_loss_mlp": 0.01188465, + "balance_loss_clip": 1.0033989, + "balance_loss_mlp": 1.00154459, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 1.9927874318141041, + "language_loss": 0.83338183, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85721684, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.5345828533172607 + }, + { + "auxiliary_loss_clip": 0.01194938, + "auxiliary_loss_mlp": 0.01188063, + "balance_loss_clip": 1.00329876, + "balance_loss_mlp": 1.00123835, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.510035798244958, + "language_loss": 0.92375904, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94758904, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.525528907775879 + }, + { + "auxiliary_loss_clip": 0.0119491, + "auxiliary_loss_mlp": 0.01188183, + "balance_loss_clip": 1.00334454, + "balance_loss_mlp": 1.00126278, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.9391462109598028, + "language_loss": 0.88226902, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90610003, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 2.532036781311035 + }, + { + "auxiliary_loss_clip": 0.01194766, + "auxiliary_loss_mlp": 0.01188385, + "balance_loss_clip": 1.00321305, + "balance_loss_mlp": 1.0014652, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5503744322813036, + "language_loss": 0.9011656, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92499709, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.581869602203369 + }, + { + "auxiliary_loss_clip": 0.01195006, + "auxiliary_loss_mlp": 0.011884, + "balance_loss_clip": 1.00343227, + "balance_loss_mlp": 1.00157559, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 1.9688532295598689, + "language_loss": 0.84874403, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87257814, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.6538844108581543 + }, + { + "auxiliary_loss_clip": 0.01194965, + "auxiliary_loss_mlp": 0.01188483, + "balance_loss_clip": 1.0032959, + "balance_loss_mlp": 1.00156307, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.6999441456646032, + "language_loss": 0.88792145, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91175592, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.552870035171509 + }, + { + "auxiliary_loss_clip": 0.01194952, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_clip": 1.0033766, + "balance_loss_mlp": 1.00111282, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 1.9137512324111925, + "language_loss": 0.88224953, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90608037, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.5932886600494385 + }, + { + "auxiliary_loss_clip": 0.01194983, + "auxiliary_loss_mlp": 0.01188944, + "balance_loss_clip": 1.00337887, + "balance_loss_mlp": 1.00202382, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.3233025440835346, + "language_loss": 0.80806589, + "learning_rate": 3.756755633390458e-06, + "loss": 0.83190513, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.577545404434204 + }, + { + "auxiliary_loss_clip": 0.01194866, + "auxiliary_loss_mlp": 0.01188565, + "balance_loss_clip": 1.003245, + "balance_loss_mlp": 1.00164509, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 3.517670704551317, + "language_loss": 0.89147085, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91530514, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.5782339572906494 + }, + { + "auxiliary_loss_clip": 0.01194943, + "auxiliary_loss_mlp": 0.01188045, + "balance_loss_clip": 1.00343251, + "balance_loss_mlp": 1.00141144, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 11.710222213962458, + "language_loss": 0.7834481, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80727804, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.5466370582580566 + }, + { + "auxiliary_loss_clip": 0.01194733, + "auxiliary_loss_mlp": 0.01188582, + "balance_loss_clip": 1.00318265, + "balance_loss_mlp": 1.00166178, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 2.3136955463836264, + "language_loss": 0.75099337, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77482653, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.517364740371704 + }, + { + "auxiliary_loss_clip": 0.01194882, + "auxiliary_loss_mlp": 0.01188355, + "balance_loss_clip": 1.00330627, + "balance_loss_mlp": 1.0016259, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.454389000766882, + "language_loss": 0.90311527, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92694771, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.5640053749084473 + }, + { + "auxiliary_loss_clip": 0.01194704, + "auxiliary_loss_mlp": 0.01188314, + "balance_loss_clip": 1.00316942, + "balance_loss_mlp": 1.0013938, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.27488738825294, + "language_loss": 0.79083055, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81466079, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.5898449420928955 + }, + { + "auxiliary_loss_clip": 0.01194709, + "auxiliary_loss_mlp": 0.01188417, + "balance_loss_clip": 1.00330853, + "balance_loss_mlp": 1.00159192, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 1.8114246053569534, + "language_loss": 0.71307689, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73690814, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.562281847000122 + }, + { + "auxiliary_loss_clip": 0.01194779, + "auxiliary_loss_mlp": 0.01188558, + "balance_loss_clip": 1.00316095, + "balance_loss_mlp": 1.00163817, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.409465136827928, + "language_loss": 0.76712865, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79096198, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.503497838973999 + }, + { + "auxiliary_loss_clip": 0.01194816, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_clip": 1.00335324, + "balance_loss_mlp": 1.00171232, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.8194493330932395, + "language_loss": 0.8510316, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.8748641, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.584508180618286 + }, + { + "auxiliary_loss_clip": 0.01194804, + "auxiliary_loss_mlp": 0.01188386, + "balance_loss_clip": 1.00325274, + "balance_loss_mlp": 1.0015614, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.052739341584163, + "language_loss": 0.80144179, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82527369, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.5447182655334473 + }, + { + "auxiliary_loss_clip": 0.01194803, + "auxiliary_loss_mlp": 0.01188372, + "balance_loss_clip": 1.00334549, + "balance_loss_mlp": 1.00192857, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 1.9854599117497853, + "language_loss": 0.87675285, + "learning_rate": 3.775311735671078e-06, + "loss": 0.90058458, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.5146799087524414 + }, + { + "auxiliary_loss_clip": 0.0119485, + "auxiliary_loss_mlp": 0.0118839, + "balance_loss_clip": 1.00334167, + "balance_loss_mlp": 1.00166059, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 2.1421036519026115, + "language_loss": 0.82646334, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.85029572, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.5842132568359375 + }, + { + "auxiliary_loss_clip": 0.01194838, + "auxiliary_loss_mlp": 0.01187901, + "balance_loss_clip": 1.00342464, + "balance_loss_mlp": 1.00126743, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 1.9021294847109866, + "language_loss": 0.8084743, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83230174, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.545301675796509 + }, + { + "auxiliary_loss_clip": 0.01194696, + "auxiliary_loss_mlp": 0.01188237, + "balance_loss_clip": 1.00317132, + "balance_loss_mlp": 1.00141215, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.7957004208145078, + "language_loss": 0.80908895, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83291829, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.55830717086792 + }, + { + "auxiliary_loss_clip": 0.01194727, + "auxiliary_loss_mlp": 0.01188038, + "balance_loss_clip": 1.00322914, + "balance_loss_mlp": 1.00140452, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.250405915454279, + "language_loss": 0.89802092, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.92184854, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.503525972366333 + }, + { + "auxiliary_loss_clip": 0.01194718, + "auxiliary_loss_mlp": 0.01187876, + "balance_loss_clip": 1.00322711, + "balance_loss_mlp": 1.00124216, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.1978055375382026, + "language_loss": 0.80423105, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82805699, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.5841333866119385 + }, + { + "auxiliary_loss_clip": 0.01194648, + "auxiliary_loss_mlp": 0.01188293, + "balance_loss_clip": 1.00315332, + "balance_loss_mlp": 1.0014677, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.715540908064602, + "language_loss": 0.76849151, + "learning_rate": 3.786194003461506e-06, + "loss": 0.79232097, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.4995839595794678 + }, + { + "auxiliary_loss_clip": 0.011946, + "auxiliary_loss_mlp": 0.01188035, + "balance_loss_clip": 1.00312519, + "balance_loss_mlp": 1.00140119, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.5378899233966394, + "language_loss": 0.88265735, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90648365, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 3.9162282943725586 + }, + { + "auxiliary_loss_clip": 0.01194581, + "auxiliary_loss_mlp": 0.01188033, + "balance_loss_clip": 1.00314713, + "balance_loss_mlp": 1.00130332, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.383769624555984, + "language_loss": 0.76181877, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78564489, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 4.020461320877075 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01187651, + "balance_loss_clip": 1.0071187, + "balance_loss_mlp": 1.00149333, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8323967274699676, + "language_loss": 0.64869589, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67254996, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 6.0937910079956055 + }, + { + "auxiliary_loss_clip": 0.01194452, + "auxiliary_loss_mlp": 0.01187838, + "balance_loss_clip": 1.00302303, + "balance_loss_mlp": 1.00139499, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 3.73366030919542, + "language_loss": 0.78407115, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80789411, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.5662319660186768 + }, + { + "auxiliary_loss_clip": 0.01194501, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_clip": 1.00302744, + "balance_loss_mlp": 1.00157225, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.0327584527290368, + "language_loss": 0.92233992, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.9461689, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.5608181953430176 + }, + { + "auxiliary_loss_clip": 0.0119463, + "auxiliary_loss_mlp": 0.01188174, + "balance_loss_clip": 1.00313354, + "balance_loss_mlp": 1.00163507, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.2463867395606942, + "language_loss": 0.89579952, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.91962755, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.5726122856140137 + }, + { + "auxiliary_loss_clip": 0.01194724, + "auxiliary_loss_mlp": 0.01187924, + "balance_loss_clip": 1.0032388, + "balance_loss_mlp": 1.00148034, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.2536537073721026, + "language_loss": 0.79409206, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81791854, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.5672404766082764 + }, + { + "auxiliary_loss_clip": 0.01194409, + "auxiliary_loss_mlp": 0.0118785, + "balance_loss_clip": 1.00305271, + "balance_loss_mlp": 1.00150228, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 5.695729201561736, + "language_loss": 0.84407079, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.8678934, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.525324583053589 + }, + { + "auxiliary_loss_clip": 0.01194585, + "auxiliary_loss_mlp": 0.01187781, + "balance_loss_clip": 1.00318193, + "balance_loss_mlp": 1.00143313, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 1.8332072451192605, + "language_loss": 0.86973345, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89355713, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.5448355674743652 + }, + { + "auxiliary_loss_clip": 0.01194586, + "auxiliary_loss_mlp": 0.01188103, + "balance_loss_clip": 1.00314331, + "balance_loss_mlp": 1.00156379, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.026925652221666, + "language_loss": 0.84865546, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87248236, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.5379931926727295 + }, + { + "auxiliary_loss_clip": 0.01194533, + "auxiliary_loss_mlp": 0.01188121, + "balance_loss_clip": 1.00304568, + "balance_loss_mlp": 1.00139165, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 2.4768702431881637, + "language_loss": 0.75547993, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77930641, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.541633129119873 + }, + { + "auxiliary_loss_clip": 0.01194548, + "auxiliary_loss_mlp": 0.0118831, + "balance_loss_clip": 1.00307822, + "balance_loss_mlp": 1.00177121, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 3.6203469599351727, + "language_loss": 0.82952607, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85335457, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 2.5924460887908936 + }, + { + "auxiliary_loss_clip": 0.01194406, + "auxiliary_loss_mlp": 0.01187993, + "balance_loss_clip": 1.00301194, + "balance_loss_mlp": 1.00145483, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.8337364424252476, + "language_loss": 0.81942439, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84324837, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.5853641033172607 + }, + { + "auxiliary_loss_clip": 0.01194618, + "auxiliary_loss_mlp": 0.01187877, + "balance_loss_clip": 1.00327075, + "balance_loss_mlp": 1.00124252, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.1593947665948265, + "language_loss": 0.83277559, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85660052, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.5737948417663574 + }, + { + "auxiliary_loss_clip": 0.01194288, + "auxiliary_loss_mlp": 0.01187996, + "balance_loss_clip": 1.00298858, + "balance_loss_mlp": 1.00155294, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.630119288378733, + "language_loss": 0.78654492, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.8103677, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.4771957397460938 + }, + { + "auxiliary_loss_clip": 0.01194411, + "auxiliary_loss_mlp": 0.01187788, + "balance_loss_clip": 1.00307894, + "balance_loss_mlp": 1.00143981, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 3.980499448875705, + "language_loss": 0.78034407, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80416608, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.490527868270874 + }, + { + "auxiliary_loss_clip": 0.01194164, + "auxiliary_loss_mlp": 0.01187544, + "balance_loss_clip": 1.00281072, + "balance_loss_mlp": 1.0011003, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.7003387229673588, + "language_loss": 0.85987973, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88369679, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.589250087738037 + }, + { + "auxiliary_loss_clip": 0.01194448, + "auxiliary_loss_mlp": 0.01188285, + "balance_loss_clip": 1.0031054, + "balance_loss_mlp": 1.00174618, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.07382154147213, + "language_loss": 0.8919906, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91581798, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.5244951248168945 + }, + { + "auxiliary_loss_clip": 0.01194375, + "auxiliary_loss_mlp": 0.01187525, + "balance_loss_clip": 1.00305247, + "balance_loss_mlp": 1.00117731, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 2.421184397317959, + "language_loss": 0.75026274, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77408171, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.4703900814056396 + }, + { + "auxiliary_loss_clip": 0.01194276, + "auxiliary_loss_mlp": 0.01187259, + "balance_loss_clip": 1.00299692, + "balance_loss_mlp": 1.00110197, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.5754340192796112, + "language_loss": 0.99123561, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01505101, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 2.5063281059265137 + }, + { + "auxiliary_loss_clip": 0.01197161, + "auxiliary_loss_mlp": 0.01186624, + "balance_loss_clip": 1.0066942, + "balance_loss_mlp": 1.00122988, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9672963992155093, + "language_loss": 0.75465912, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77849698, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.160263776779175 + }, + { + "auxiliary_loss_clip": 0.01194312, + "auxiliary_loss_mlp": 0.01188173, + "balance_loss_clip": 1.00291634, + "balance_loss_mlp": 1.00153923, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.7460829729165375, + "language_loss": 0.78277147, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80659634, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.6773200035095215 + }, + { + "auxiliary_loss_clip": 0.0119434, + "auxiliary_loss_mlp": 0.01187979, + "balance_loss_clip": 1.00302553, + "balance_loss_mlp": 1.00163102, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.200951239316481, + "language_loss": 0.96422482, + "learning_rate": 3.826284353801652e-06, + "loss": 0.98804802, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.53548264503479 + }, + { + "auxiliary_loss_clip": 0.01194427, + "auxiliary_loss_mlp": 0.0118829, + "balance_loss_clip": 1.00306821, + "balance_loss_mlp": 1.00184691, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.139706749702589, + "language_loss": 0.87832069, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90214789, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.550388813018799 + }, + { + "auxiliary_loss_clip": 0.01194436, + "auxiliary_loss_mlp": 0.01188168, + "balance_loss_clip": 1.00310206, + "balance_loss_mlp": 1.00191522, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.2228206257908645, + "language_loss": 0.84728253, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87110853, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.533430337905884 + }, + { + "auxiliary_loss_clip": 0.01194273, + "auxiliary_loss_mlp": 0.01188413, + "balance_loss_clip": 1.00299025, + "balance_loss_mlp": 1.00216079, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.4634624602942914, + "language_loss": 0.83531302, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85913992, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.5267462730407715 + }, + { + "auxiliary_loss_clip": 0.0119441, + "auxiliary_loss_mlp": 0.0118796, + "balance_loss_clip": 1.00314856, + "balance_loss_mlp": 1.00170743, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8017755238042872, + "language_loss": 0.89167833, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91550207, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.5286874771118164 + }, + { + "auxiliary_loss_clip": 0.01194369, + "auxiliary_loss_mlp": 0.0118864, + "balance_loss_clip": 1.00304985, + "balance_loss_mlp": 1.00229216, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 1.9516363609296403, + "language_loss": 0.6975944, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72142452, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.5350146293640137 + }, + { + "auxiliary_loss_clip": 0.01194216, + "auxiliary_loss_mlp": 0.01188029, + "balance_loss_clip": 1.00293851, + "balance_loss_mlp": 1.00158596, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 2.5998641455165146, + "language_loss": 0.87870806, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90253055, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.5629093647003174 + }, + { + "auxiliary_loss_clip": 0.01194345, + "auxiliary_loss_mlp": 0.01187506, + "balance_loss_clip": 1.00311589, + "balance_loss_mlp": 1.00115824, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 1.9591207430983302, + "language_loss": 0.83502507, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85884356, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.60825514793396 + }, + { + "auxiliary_loss_clip": 0.01194271, + "auxiliary_loss_mlp": 0.01188014, + "balance_loss_clip": 1.002913, + "balance_loss_mlp": 1.00157034, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 2.532482081520095, + "language_loss": 0.93639976, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96022266, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.559947967529297 + }, + { + "auxiliary_loss_clip": 0.01194223, + "auxiliary_loss_mlp": 0.01187521, + "balance_loss_clip": 1.00298476, + "balance_loss_mlp": 1.00126863, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.9197832584174663, + "language_loss": 0.87782896, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90164638, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.5381293296813965 + }, + { + "auxiliary_loss_clip": 0.01194328, + "auxiliary_loss_mlp": 0.01187684, + "balance_loss_clip": 1.0030241, + "balance_loss_mlp": 1.00143147, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 3.7630203189076967, + "language_loss": 0.89253902, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91635913, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.5604498386383057 + }, + { + "auxiliary_loss_clip": 0.01194329, + "auxiliary_loss_mlp": 0.0118755, + "balance_loss_clip": 1.00299168, + "balance_loss_mlp": 1.00139284, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.166468459858644, + "language_loss": 0.86002028, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88383907, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.593806743621826 + }, + { + "auxiliary_loss_clip": 0.01194122, + "auxiliary_loss_mlp": 0.01187516, + "balance_loss_clip": 1.00300908, + "balance_loss_mlp": 1.00135827, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 1.9509393664085541, + "language_loss": 0.89207464, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91589105, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.5125112533569336 + }, + { + "auxiliary_loss_clip": 0.01194168, + "auxiliary_loss_mlp": 0.01188255, + "balance_loss_clip": 1.00303972, + "balance_loss_mlp": 1.00209761, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0965524252719328, + "language_loss": 0.81438482, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83820903, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.507640838623047 + }, + { + "auxiliary_loss_clip": 0.01194065, + "auxiliary_loss_mlp": 0.01187923, + "balance_loss_clip": 1.00287604, + "balance_loss_mlp": 1.00157487, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.4527311769506883, + "language_loss": 0.85841876, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88223869, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.5314626693725586 + }, + { + "auxiliary_loss_clip": 0.01197059, + "auxiliary_loss_mlp": 0.01186205, + "balance_loss_clip": 1.00670516, + "balance_loss_mlp": 1.00081062, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 1.0529218531669926, + "language_loss": 0.63803983, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66187245, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 2.9530794620513916 + }, + { + "auxiliary_loss_clip": 0.01194181, + "auxiliary_loss_mlp": 0.0118751, + "balance_loss_clip": 1.00301874, + "balance_loss_mlp": 1.00125766, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 5.972070102785061, + "language_loss": 0.83595115, + "learning_rate": 3.852770440269372e-06, + "loss": 0.85976803, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 3.967302083969116 + }, + { + "auxiliary_loss_clip": 0.01194306, + "auxiliary_loss_mlp": 0.01187908, + "balance_loss_clip": 1.00298655, + "balance_loss_mlp": 1.00165486, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.3924224584089058, + "language_loss": 0.84240031, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86622238, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 3.998319625854492 + }, + { + "auxiliary_loss_clip": 0.01194098, + "auxiliary_loss_mlp": 0.01187702, + "balance_loss_clip": 1.002846, + "balance_loss_mlp": 1.00116348, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.6583442950033, + "language_loss": 0.86148369, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88530171, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.503230333328247 + }, + { + "auxiliary_loss_clip": 0.01194168, + "auxiliary_loss_mlp": 0.011877, + "balance_loss_clip": 1.00300074, + "balance_loss_mlp": 1.00163794, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.0598290366032384, + "language_loss": 0.86119854, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88501728, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 5.315576076507568 + }, + { + "auxiliary_loss_clip": 0.01194175, + "auxiliary_loss_mlp": 0.01187672, + "balance_loss_clip": 1.00300395, + "balance_loss_mlp": 1.00151515, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 1.7376255702409658, + "language_loss": 0.79263407, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81645256, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.537804365158081 + }, + { + "auxiliary_loss_clip": 0.011941, + "auxiliary_loss_mlp": 0.01187478, + "balance_loss_clip": 1.00293732, + "balance_loss_mlp": 1.00141609, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.0296242860046396, + "language_loss": 0.78475863, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80857444, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.5855441093444824 + }, + { + "auxiliary_loss_clip": 0.01194164, + "auxiliary_loss_mlp": 0.01187452, + "balance_loss_clip": 1.0029062, + "balance_loss_mlp": 1.00138986, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.441976497811956, + "language_loss": 0.9465605, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97037673, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.515627861022949 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01187555, + "balance_loss_clip": 1.00287282, + "balance_loss_mlp": 1.00130272, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.484823216718209, + "language_loss": 0.99764818, + "learning_rate": 3.864024073288798e-06, + "loss": 1.0214653, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.4927263259887695 + }, + { + "auxiliary_loss_clip": 0.01194181, + "auxiliary_loss_mlp": 0.01187922, + "balance_loss_clip": 1.00296032, + "balance_loss_mlp": 1.00166965, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 3.8762247829627934, + "language_loss": 0.87546408, + "learning_rate": 3.865615797668091e-06, + "loss": 0.89928508, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.4894440174102783 + }, + { + "auxiliary_loss_clip": 0.01194548, + "auxiliary_loss_mlp": 0.01188137, + "balance_loss_clip": 1.00325692, + "balance_loss_mlp": 1.00169337, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.3301426853075804, + "language_loss": 0.93163627, + "learning_rate": 3.867203596705844e-06, + "loss": 0.95546311, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.5131354331970215 + }, + { + "auxiliary_loss_clip": 0.01194176, + "auxiliary_loss_mlp": 0.01187627, + "balance_loss_clip": 1.00294089, + "balance_loss_mlp": 1.00146961, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.13588108138142, + "language_loss": 0.87258345, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89640152, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.524601459503174 + }, + { + "auxiliary_loss_clip": 0.01194317, + "auxiliary_loss_mlp": 0.01187667, + "balance_loss_clip": 1.00315642, + "balance_loss_mlp": 1.0014143, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 1.9741475084229385, + "language_loss": 0.73916984, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76298964, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.656219244003296 + }, + { + "auxiliary_loss_clip": 0.01194201, + "auxiliary_loss_mlp": 0.01187786, + "balance_loss_clip": 1.00295949, + "balance_loss_mlp": 1.00153387, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.478963013957467, + "language_loss": 0.92407656, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94789648, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.538971185684204 + }, + { + "auxiliary_loss_clip": 0.01194197, + "auxiliary_loss_mlp": 0.01187782, + "balance_loss_clip": 1.00298393, + "balance_loss_mlp": 1.00143397, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 2.1324147772387496, + "language_loss": 0.82766438, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85148418, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 2.636322021484375 + }, + { + "auxiliary_loss_clip": 0.01194331, + "auxiliary_loss_mlp": 0.01187918, + "balance_loss_clip": 1.00309265, + "balance_loss_mlp": 1.00166583, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.218199996204417, + "language_loss": 0.77641559, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80023813, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.5828323364257812 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01187983, + "balance_loss_clip": 1.00289881, + "balance_loss_mlp": 1.00182533, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.6403791308806315, + "language_loss": 0.86660731, + "learning_rate": 3.87664903040738e-06, + "loss": 0.89042783, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.5357069969177246 + }, + { + "auxiliary_loss_clip": 0.01197728, + "auxiliary_loss_mlp": 0.01185969, + "balance_loss_clip": 1.00727546, + "balance_loss_mlp": 1.00057447, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8497868386552573, + "language_loss": 0.58587325, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60971022, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 3.1782314777374268 + }, + { + "auxiliary_loss_clip": 0.01194106, + "auxiliary_loss_mlp": 0.01187715, + "balance_loss_clip": 1.00285006, + "balance_loss_mlp": 1.00155807, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.8808702077369361, + "language_loss": 0.8047576, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82857579, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.608722448348999 + }, + { + "auxiliary_loss_clip": 0.01194043, + "auxiliary_loss_mlp": 0.01187758, + "balance_loss_clip": 1.00290418, + "balance_loss_mlp": 1.00160122, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 3.7767742987784643, + "language_loss": 0.80044466, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82426262, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.528935670852661 + }, + { + "auxiliary_loss_clip": 0.01194247, + "auxiliary_loss_mlp": 0.01187621, + "balance_loss_clip": 1.00298786, + "balance_loss_mlp": 1.00146389, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 3.1622555024757495, + "language_loss": 0.9590981, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98291677, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.4859838485717773 + }, + { + "auxiliary_loss_clip": 0.01194124, + "auxiliary_loss_mlp": 0.01187441, + "balance_loss_clip": 1.00289845, + "balance_loss_mlp": 1.00118816, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 1.646089628129281, + "language_loss": 0.77487195, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79868758, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 2.535522937774658 + }, + { + "auxiliary_loss_clip": 0.0119424, + "auxiliary_loss_mlp": 0.01187435, + "balance_loss_clip": 1.00311852, + "balance_loss_mlp": 1.00156403, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6112753897621555, + "language_loss": 0.76995397, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79377073, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.5723884105682373 + }, + { + "auxiliary_loss_clip": 0.01194123, + "auxiliary_loss_mlp": 0.01187256, + "balance_loss_clip": 1.00296807, + "balance_loss_mlp": 1.0013845, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.9919937023180325, + "language_loss": 0.81006807, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83388191, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.509366750717163 + }, + { + "auxiliary_loss_clip": 0.01194038, + "auxiliary_loss_mlp": 0.01187626, + "balance_loss_clip": 1.00291824, + "balance_loss_mlp": 1.00165927, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 2.1867212888187373, + "language_loss": 0.73684192, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.7606585, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.493070125579834 + }, + { + "auxiliary_loss_clip": 0.0119406, + "auxiliary_loss_mlp": 0.01187648, + "balance_loss_clip": 1.00286424, + "balance_loss_mlp": 1.00187206, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.8114089179766442, + "language_loss": 0.78903407, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81285119, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.5441527366638184 + }, + { + "auxiliary_loss_clip": 0.01194022, + "auxiliary_loss_mlp": 0.01187482, + "balance_loss_clip": 1.00293446, + "balance_loss_mlp": 1.0016104, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 3.57027771257863, + "language_loss": 0.81685126, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84066629, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.586691379547119 + }, + { + "auxiliary_loss_clip": 0.01194111, + "auxiliary_loss_mlp": 0.01187484, + "balance_loss_clip": 1.00290668, + "balance_loss_mlp": 1.0015173, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 8.151595816253028, + "language_loss": 0.83384037, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85765636, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.5627713203430176 + }, + { + "auxiliary_loss_clip": 0.01194104, + "auxiliary_loss_mlp": 0.0118694, + "balance_loss_clip": 1.0030086, + "balance_loss_mlp": 1.00106931, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.6200096280685026, + "language_loss": 0.74284244, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76665282, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.6436071395874023 + }, + { + "auxiliary_loss_clip": 0.0119397, + "auxiliary_loss_mlp": 0.01187606, + "balance_loss_clip": 1.00291407, + "balance_loss_mlp": 1.00183058, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 22.327270693626403, + "language_loss": 0.83272088, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85653669, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.548456907272339 + }, + { + "auxiliary_loss_clip": 0.01193872, + "auxiliary_loss_mlp": 0.0118718, + "balance_loss_clip": 1.00273967, + "balance_loss_mlp": 1.00111866, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 3.086300065617002, + "language_loss": 0.85339737, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87720788, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.543372869491577 + }, + { + "auxiliary_loss_clip": 0.01197643, + "auxiliary_loss_mlp": 0.01185726, + "balance_loss_clip": 1.0069505, + "balance_loss_mlp": 1.00033128, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8922055653330014, + "language_loss": 0.57208419, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59591788, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.22868275642395 + }, + { + "auxiliary_loss_clip": 0.01194279, + "auxiliary_loss_mlp": 0.01187885, + "balance_loss_clip": 1.00309026, + "balance_loss_mlp": 1.00182271, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.9780497375782216, + "language_loss": 0.88113189, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90495348, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.5145020484924316 + }, + { + "auxiliary_loss_clip": 0.01193901, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_clip": 1.00277591, + "balance_loss_mlp": 1.00128484, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.9565392347217694, + "language_loss": 0.86004949, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88386011, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.612321376800537 + }, + { + "auxiliary_loss_clip": 0.01193896, + "auxiliary_loss_mlp": 0.01187307, + "balance_loss_clip": 1.00276053, + "balance_loss_mlp": 1.00143647, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.724611800612039, + "language_loss": 0.88100708, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90481913, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.605966091156006 + }, + { + "auxiliary_loss_clip": 0.0119395, + "auxiliary_loss_mlp": 0.01186885, + "balance_loss_clip": 1.00298548, + "balance_loss_mlp": 1.00120509, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 2.049162727065583, + "language_loss": 0.8421973, + "learning_rate": 3.905676939184698e-06, + "loss": 0.8660056, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.5433804988861084 + }, + { + "auxiliary_loss_clip": 0.01194059, + "auxiliary_loss_mlp": 0.01187026, + "balance_loss_clip": 1.00289011, + "balance_loss_mlp": 1.00105953, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 7.36807884396971, + "language_loss": 0.86667866, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89048958, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.5277695655822754 + }, + { + "auxiliary_loss_clip": 0.01194053, + "auxiliary_loss_mlp": 0.01187233, + "balance_loss_clip": 1.00295043, + "balance_loss_mlp": 1.00126648, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 4.292414982322496, + "language_loss": 0.7596764, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78348923, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.6148829460144043 + }, + { + "auxiliary_loss_clip": 0.01193891, + "auxiliary_loss_mlp": 0.01187369, + "balance_loss_clip": 1.00279593, + "balance_loss_mlp": 1.00149775, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.0620738030412986, + "language_loss": 0.89798629, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92179894, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.4893834590911865 + }, + { + "auxiliary_loss_clip": 0.01193873, + "auxiliary_loss_mlp": 0.01187384, + "balance_loss_clip": 1.00280643, + "balance_loss_mlp": 1.00160861, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.908975956030854, + "language_loss": 0.80398303, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82779562, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 3.9663827419281006 + }, + { + "auxiliary_loss_clip": 0.01193899, + "auxiliary_loss_mlp": 0.01187381, + "balance_loss_clip": 1.00279319, + "balance_loss_mlp": 1.00150967, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 2.959814649720228, + "language_loss": 0.86407518, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88788795, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 4.023804426193237 + }, + { + "auxiliary_loss_clip": 0.01194082, + "auxiliary_loss_mlp": 0.01187159, + "balance_loss_clip": 1.00298727, + "balance_loss_mlp": 1.0014782, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.0787716891278123, + "language_loss": 0.74877119, + "learning_rate": 3.914578263220868e-06, + "loss": 0.7725836, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.5144734382629395 + }, + { + "auxiliary_loss_clip": 0.01193939, + "auxiliary_loss_mlp": 0.01187189, + "balance_loss_clip": 1.00285089, + "balance_loss_mlp": 1.00150871, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.616481066796504, + "language_loss": 0.91503692, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93884826, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 5.296837329864502 + }, + { + "auxiliary_loss_clip": 0.01197172, + "auxiliary_loss_mlp": 0.01185592, + "balance_loss_clip": 1.00662088, + "balance_loss_mlp": 1.00019729, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.8730003426686965, + "language_loss": 0.62596607, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64979368, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.182307243347168 + }, + { + "auxiliary_loss_clip": 0.01194142, + "auxiliary_loss_mlp": 0.01187386, + "balance_loss_clip": 1.00304842, + "balance_loss_mlp": 1.00151539, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 1.9261278979470766, + "language_loss": 0.75519717, + "learning_rate": 3.918983198419573e-06, + "loss": 0.77901244, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.5908944606781006 + }, + { + "auxiliary_loss_clip": 0.01193988, + "auxiliary_loss_mlp": 0.01186894, + "balance_loss_clip": 1.00294363, + "balance_loss_mlp": 1.00111794, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.932229272439638, + "language_loss": 0.83304071, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85684949, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.507209539413452 + }, + { + "auxiliary_loss_clip": 0.01193922, + "auxiliary_loss_mlp": 0.01187352, + "balance_loss_clip": 1.00285566, + "balance_loss_mlp": 1.00157666, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.9683859381814974, + "language_loss": 0.78555334, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80936611, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.547484874725342 + }, + { + "auxiliary_loss_clip": 0.01197021, + "auxiliary_loss_mlp": 0.01185579, + "balance_loss_clip": 1.00648439, + "balance_loss_mlp": 1.00018489, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9615286750740951, + "language_loss": 0.64492464, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66875064, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 2.9661691188812256 + }, + { + "auxiliary_loss_clip": 0.01193972, + "auxiliary_loss_mlp": 0.01186956, + "balance_loss_clip": 1.00296652, + "balance_loss_mlp": 1.00137103, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 1.9420974592856912, + "language_loss": 0.82552195, + "learning_rate": 3.924809954779425e-06, + "loss": 0.84933126, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.534430503845215 + }, + { + "auxiliary_loss_clip": 0.01193975, + "auxiliary_loss_mlp": 0.01187124, + "balance_loss_clip": 1.00287414, + "balance_loss_mlp": 1.00125337, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.1329942048706756, + "language_loss": 0.95403731, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97784829, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.546915292739868 + }, + { + "auxiliary_loss_clip": 0.01193873, + "auxiliary_loss_mlp": 0.01187295, + "balance_loss_clip": 1.00290847, + "balance_loss_mlp": 1.00161445, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.567010012435333, + "language_loss": 0.91802943, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94184113, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.5148887634277344 + }, + { + "auxiliary_loss_clip": 0.01193891, + "auxiliary_loss_mlp": 0.0118689, + "balance_loss_clip": 1.00294065, + "balance_loss_mlp": 1.00130475, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.0161332174334947, + "language_loss": 0.79546428, + "learning_rate": 3.92914567610317e-06, + "loss": 0.81927216, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.4911205768585205 + }, + { + "auxiliary_loss_clip": 0.01194044, + "auxiliary_loss_mlp": 0.01187049, + "balance_loss_clip": 1.00295103, + "balance_loss_mlp": 1.00127375, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 1.895078653562906, + "language_loss": 0.86681455, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89062548, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.5462348461151123 + }, + { + "auxiliary_loss_clip": 0.01193781, + "auxiliary_loss_mlp": 0.01187165, + "balance_loss_clip": 1.00280571, + "balance_loss_mlp": 1.00167489, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.0239400285127833, + "language_loss": 0.88817918, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91198862, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.5344440937042236 + }, + { + "auxiliary_loss_clip": 0.01193928, + "auxiliary_loss_mlp": 0.0118705, + "balance_loss_clip": 1.00283074, + "balance_loss_mlp": 1.00156009, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9959242355963298, + "language_loss": 0.80727154, + "learning_rate": 3.933452395729493e-06, + "loss": 0.83108127, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.520552396774292 + }, + { + "auxiliary_loss_clip": 0.01193996, + "auxiliary_loss_mlp": 0.01186875, + "balance_loss_clip": 1.00305319, + "balance_loss_mlp": 1.00138545, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 1.544304598869252, + "language_loss": 0.81471628, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83852506, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.578425884246826 + }, + { + "auxiliary_loss_clip": 0.01193978, + "auxiliary_loss_mlp": 0.01186571, + "balance_loss_clip": 1.00305808, + "balance_loss_mlp": 1.0014627, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.6083652632627659, + "language_loss": 0.77131379, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79511929, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.5305099487304688 + }, + { + "auxiliary_loss_clip": 0.01193762, + "auxiliary_loss_mlp": 0.01186994, + "balance_loss_clip": 1.00289488, + "balance_loss_mlp": 1.00150394, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.967383959018676, + "language_loss": 0.729743, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75355059, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.5792858600616455 + }, + { + "auxiliary_loss_clip": 0.01193812, + "auxiliary_loss_mlp": 0.01186904, + "balance_loss_clip": 1.0029006, + "balance_loss_mlp": 1.00131905, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 5.7841920438402505, + "language_loss": 0.82282782, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84663498, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.7335891723632812 + }, + { + "auxiliary_loss_clip": 0.01193913, + "auxiliary_loss_mlp": 0.01186513, + "balance_loss_clip": 1.003016, + "balance_loss_mlp": 1.00121427, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.8704819465752487, + "language_loss": 0.75328898, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77709329, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.5506138801574707 + }, + { + "auxiliary_loss_clip": 0.01193865, + "auxiliary_loss_mlp": 0.01186859, + "balance_loss_clip": 1.0028708, + "balance_loss_mlp": 1.00155973, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.0508706960337726, + "language_loss": 0.8089987, + "learning_rate": 3.941980363893499e-06, + "loss": 0.83280599, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.5093586444854736 + }, + { + "auxiliary_loss_clip": 0.01193676, + "auxiliary_loss_mlp": 0.01186631, + "balance_loss_clip": 1.00273442, + "balance_loss_mlp": 1.00123656, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 1.862942434496446, + "language_loss": 0.81741381, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.84121692, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 2.5222902297973633 + }, + { + "auxiliary_loss_clip": 0.01193817, + "auxiliary_loss_mlp": 0.01187112, + "balance_loss_clip": 1.00287521, + "balance_loss_mlp": 1.0016228, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 5.732364399931975, + "language_loss": 0.94020534, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96401465, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 2.5457935333251953 + }, + { + "auxiliary_loss_clip": 0.01193828, + "auxiliary_loss_mlp": 0.01186928, + "balance_loss_clip": 1.00290442, + "balance_loss_mlp": 1.00162911, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4176595801671814, + "language_loss": 0.79036534, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81417286, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.474149227142334 + }, + { + "auxiliary_loss_clip": 0.01193948, + "auxiliary_loss_mlp": 0.01186772, + "balance_loss_clip": 1.00301182, + "balance_loss_mlp": 1.00128245, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.879143383158153, + "language_loss": 0.83338177, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85718894, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.5678277015686035 + }, + { + "auxiliary_loss_clip": 0.01196449, + "auxiliary_loss_mlp": 0.01184823, + "balance_loss_clip": 1.00612354, + "balance_loss_mlp": 1.00019193, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5391119059813447, + "language_loss": 0.73675585, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7605685, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.0983927249908447 + }, + { + "auxiliary_loss_clip": 0.01193945, + "auxiliary_loss_mlp": 0.01186928, + "balance_loss_clip": 1.00314307, + "balance_loss_mlp": 1.00162864, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.5591416579930555, + "language_loss": 0.81079006, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83459878, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.5987939834594727 + }, + { + "auxiliary_loss_clip": 0.01193799, + "auxiliary_loss_mlp": 0.01186873, + "balance_loss_clip": 1.00291765, + "balance_loss_mlp": 1.0013833, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2561848761495638, + "language_loss": 0.90334988, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92715663, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.5605251789093018 + }, + { + "auxiliary_loss_clip": 0.01196404, + "auxiliary_loss_mlp": 0.01184862, + "balance_loss_clip": 1.00609326, + "balance_loss_mlp": 1.00023031, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.893848385460359, + "language_loss": 0.59074169, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61455435, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.0916011333465576 + }, + { + "auxiliary_loss_clip": 0.01193934, + "auxiliary_loss_mlp": 0.01187314, + "balance_loss_clip": 1.00302601, + "balance_loss_mlp": 1.00163317, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 4.063426610456881, + "language_loss": 0.81336492, + "learning_rate": 3.954564194750784e-06, + "loss": 0.8371774, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.5740203857421875 + }, + { + "auxiliary_loss_clip": 0.0119367, + "auxiliary_loss_mlp": 0.01186565, + "balance_loss_clip": 1.00276232, + "balance_loss_mlp": 1.00126588, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.4388023298448287, + "language_loss": 0.78402102, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80782342, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.548100471496582 + }, + { + "auxiliary_loss_clip": 0.011938, + "auxiliary_loss_mlp": 0.01186612, + "balance_loss_clip": 1.00294352, + "balance_loss_mlp": 1.00150394, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 1.893867607164765, + "language_loss": 0.87587339, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89967752, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.52522611618042 + }, + { + "auxiliary_loss_clip": 0.01193828, + "auxiliary_loss_mlp": 0.01187139, + "balance_loss_clip": 1.00297296, + "balance_loss_mlp": 1.00174487, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.8187049168142124, + "language_loss": 0.86121899, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88502866, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.5135107040405273 + }, + { + "auxiliary_loss_clip": 0.01193628, + "auxiliary_loss_mlp": 0.01186447, + "balance_loss_clip": 1.00281382, + "balance_loss_mlp": 1.00124359, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.290270549494536, + "language_loss": 0.91856635, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94236708, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.534048318862915 + }, + { + "auxiliary_loss_clip": 0.01193792, + "auxiliary_loss_mlp": 0.01186605, + "balance_loss_clip": 1.00297236, + "balance_loss_mlp": 1.00149632, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.436005310824955, + "language_loss": 0.81648946, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84029341, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.4976768493652344 + }, + { + "auxiliary_loss_clip": 0.01193669, + "auxiliary_loss_mlp": 0.01187061, + "balance_loss_clip": 1.00282097, + "balance_loss_mlp": 1.00157118, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.735880392198246, + "language_loss": 0.93420941, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95801669, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.513103485107422 + }, + { + "auxiliary_loss_clip": 0.01193564, + "auxiliary_loss_mlp": 0.01186644, + "balance_loss_clip": 1.00275278, + "balance_loss_mlp": 1.00134528, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 2.0222620941801215, + "language_loss": 0.76037335, + "learning_rate": 3.964184363657625e-06, + "loss": 0.7841754, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.6597790718078613 + }, + { + "auxiliary_loss_clip": 0.01193742, + "auxiliary_loss_mlp": 0.01186544, + "balance_loss_clip": 1.00279665, + "balance_loss_mlp": 1.00114977, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 2.1758086290914656, + "language_loss": 0.93537265, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95917553, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 3.953016996383667 + }, + { + "auxiliary_loss_clip": 0.01193775, + "auxiliary_loss_mlp": 0.01187479, + "balance_loss_clip": 1.00286007, + "balance_loss_mlp": 1.00217962, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 2.727397293856445, + "language_loss": 0.88739026, + "learning_rate": 3.96690678709433e-06, + "loss": 0.91120273, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 3.97426700592041 + }, + { + "auxiliary_loss_clip": 0.01193608, + "auxiliary_loss_mlp": 0.01186591, + "balance_loss_clip": 1.00283301, + "balance_loss_mlp": 1.00148261, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.297202027689152, + "language_loss": 0.78992552, + "learning_rate": 3.968263694200355e-06, + "loss": 0.8137275, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.571420431137085 + }, + { + "auxiliary_loss_clip": 0.01196129, + "auxiliary_loss_mlp": 0.01184754, + "balance_loss_clip": 1.00587451, + "balance_loss_mlp": 1.00012231, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9147917842382408, + "language_loss": 0.6698246, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69363344, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 5.944110870361328 + }, + { + "auxiliary_loss_clip": 0.01193689, + "auxiliary_loss_mlp": 0.01186539, + "balance_loss_clip": 1.00291204, + "balance_loss_mlp": 1.00123978, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.2018703626802547, + "language_loss": 0.83679426, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86059654, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.4970438480377197 + }, + { + "auxiliary_loss_clip": 0.01193788, + "auxiliary_loss_mlp": 0.01186746, + "balance_loss_clip": 1.00310135, + "balance_loss_mlp": 1.00154233, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.1307398043769457, + "language_loss": 0.82357132, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84737664, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.5529823303222656 + }, + { + "auxiliary_loss_clip": 0.01193565, + "auxiliary_loss_mlp": 0.01186794, + "balance_loss_clip": 1.00275469, + "balance_loss_mlp": 1.00149512, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.459494307037123, + "language_loss": 0.80999017, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83379382, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.5164124965667725 + }, + { + "auxiliary_loss_clip": 0.01193609, + "auxiliary_loss_mlp": 0.01186625, + "balance_loss_clip": 1.00282121, + "balance_loss_mlp": 1.00151646, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 8.317483729812798, + "language_loss": 0.73587072, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75967306, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.517826557159424 + }, + { + "auxiliary_loss_clip": 0.01193622, + "auxiliary_loss_mlp": 0.01186559, + "balance_loss_clip": 1.00282168, + "balance_loss_mlp": 1.00135589, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.893123813833215, + "language_loss": 0.8761363, + "learning_rate": 3.976345626888605e-06, + "loss": 0.89993811, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.5728776454925537 + }, + { + "auxiliary_loss_clip": 0.01195973, + "auxiliary_loss_mlp": 0.011848, + "balance_loss_clip": 1.0057466, + "balance_loss_mlp": 1.0001688, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8198592345373766, + "language_loss": 0.66048247, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68429017, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.8842625617980957 + }, + { + "auxiliary_loss_clip": 0.01193755, + "auxiliary_loss_mlp": 0.01186646, + "balance_loss_clip": 1.0029726, + "balance_loss_mlp": 1.00144219, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.7877255222699997, + "language_loss": 0.7902264, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81403047, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.495903730392456 + }, + { + "auxiliary_loss_clip": 0.01193679, + "auxiliary_loss_mlp": 0.01186749, + "balance_loss_clip": 1.00292158, + "balance_loss_mlp": 1.0017364, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.3899695914949213, + "language_loss": 0.75844908, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78225338, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.50246524810791 + }, + { + "auxiliary_loss_clip": 0.01193693, + "auxiliary_loss_mlp": 0.01186559, + "balance_loss_clip": 1.00287497, + "balance_loss_mlp": 1.00125992, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.4090280459130717, + "language_loss": 0.83955508, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.8633576, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.495859384536743 + }, + { + "auxiliary_loss_clip": 0.01193708, + "auxiliary_loss_mlp": 0.01186911, + "balance_loss_clip": 1.00289679, + "balance_loss_mlp": 1.00151658, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 6.1141693664086185, + "language_loss": 0.84346551, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86727166, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.5188920497894287 + }, + { + "auxiliary_loss_clip": 0.01193498, + "auxiliary_loss_mlp": 0.01186932, + "balance_loss_clip": 1.00269914, + "balance_loss_mlp": 1.00182426, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.961945499432789, + "language_loss": 0.88898063, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91278487, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.5677473545074463 + }, + { + "auxiliary_loss_clip": 0.01193826, + "auxiliary_loss_mlp": 0.01186472, + "balance_loss_clip": 1.00307333, + "balance_loss_mlp": 1.0011735, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.7452173702542724, + "language_loss": 0.88672781, + "learning_rate": 3.985648090637122e-06, + "loss": 0.91053081, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.51054048538208 + }, + { + "auxiliary_loss_clip": 0.01193551, + "auxiliary_loss_mlp": 0.01186632, + "balance_loss_clip": 1.00280046, + "balance_loss_mlp": 1.00152349, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.8115510538780386, + "language_loss": 0.88752806, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91132987, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.563990831375122 + }, + { + "auxiliary_loss_clip": 0.01193427, + "auxiliary_loss_mlp": 0.01186378, + "balance_loss_clip": 1.00265908, + "balance_loss_mlp": 1.00136471, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 1.7870436880674658, + "language_loss": 0.88409972, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90789783, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.53324031829834 + }, + { + "auxiliary_loss_clip": 0.01193402, + "auxiliary_loss_mlp": 0.0118685, + "balance_loss_clip": 1.00265968, + "balance_loss_mlp": 1.00174141, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 4.300839715522084, + "language_loss": 0.91753376, + "learning_rate": 3.989594081641164e-06, + "loss": 0.94133627, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.566241502761841 + }, + { + "auxiliary_loss_clip": 0.01193355, + "auxiliary_loss_mlp": 0.0118632, + "balance_loss_clip": 1.0026803, + "balance_loss_mlp": 1.00140238, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.3469596550293037, + "language_loss": 0.85670412, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.88050085, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.511317014694214 + }, + { + "auxiliary_loss_clip": 0.01193737, + "auxiliary_loss_mlp": 0.01186716, + "balance_loss_clip": 1.00301421, + "balance_loss_mlp": 1.00160766, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 3.3494206404442775, + "language_loss": 0.84303236, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86683691, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.4927685260772705 + }, + { + "auxiliary_loss_clip": 0.01193427, + "auxiliary_loss_mlp": 0.01186594, + "balance_loss_clip": 1.00265038, + "balance_loss_mlp": 1.00139022, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 2.0128385394759083, + "language_loss": 0.86624551, + "learning_rate": 3.99351603600268e-06, + "loss": 0.89004576, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.5293469429016113 + }, + { + "auxiliary_loss_clip": 0.01193528, + "auxiliary_loss_mlp": 0.01186716, + "balance_loss_clip": 1.00281239, + "balance_loss_mlp": 1.00141752, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.1791496436166327, + "language_loss": 0.86546504, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88926744, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.528790235519409 + }, + { + "auxiliary_loss_clip": 0.01193323, + "auxiliary_loss_mlp": 0.01186054, + "balance_loss_clip": 1.00266027, + "balance_loss_mlp": 1.00132728, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.8462710162906928, + "language_loss": 0.61995339, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64374721, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.551722764968872 + }, + { + "auxiliary_loss_clip": 0.01193563, + "auxiliary_loss_mlp": 0.01186754, + "balance_loss_clip": 1.00287223, + "balance_loss_mlp": 1.00155044, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.9970494451876686, + "language_loss": 0.88927376, + "learning_rate": 3.997414244783595e-06, + "loss": 0.913077, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.554532051086426 + }, + { + "auxiliary_loss_clip": 0.01193637, + "auxiliary_loss_mlp": 0.01186612, + "balance_loss_clip": 1.00293064, + "balance_loss_mlp": 1.00169396, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 2.4880967140520456, + "language_loss": 0.84864664, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87244904, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.4973831176757812 + }, + { + "auxiliary_loss_clip": 0.01193441, + "auxiliary_loss_mlp": 0.01186338, + "balance_loss_clip": 1.00278723, + "balance_loss_mlp": 1.00151563, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 4.42575904650874, + "language_loss": 0.78620559, + "learning_rate": 4e-06, + "loss": 0.8100034, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.543992042541504 + }, + { + "auxiliary_loss_clip": 0.01193566, + "auxiliary_loss_mlp": 0.01186692, + "balance_loss_clip": 1.00282979, + "balance_loss_mlp": 1.00158393, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 2.12381593349138, + "language_loss": 0.82549828, + "learning_rate": 3.9999999620799e-06, + "loss": 0.84930086, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.525519847869873 + }, + { + "auxiliary_loss_clip": 0.01193267, + "auxiliary_loss_mlp": 0.01186612, + "balance_loss_clip": 1.00261557, + "balance_loss_mlp": 1.00150323, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.85024903707195, + "language_loss": 0.88021868, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90401745, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.513953685760498 + }, + { + "auxiliary_loss_clip": 0.01193626, + "auxiliary_loss_mlp": 0.01186529, + "balance_loss_clip": 1.00283122, + "balance_loss_mlp": 1.00132561, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.732839708343612, + "language_loss": 0.86655581, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.89035732, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.4966237545013428 + }, + { + "auxiliary_loss_clip": 0.01193589, + "auxiliary_loss_mlp": 0.01186399, + "balance_loss_clip": 1.00290549, + "balance_loss_mlp": 1.0012908, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 3.156153834473061, + "language_loss": 0.84567815, + "learning_rate": 3.999999393278425e-06, + "loss": 0.86947805, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.4916887283325195 + }, + { + "auxiliary_loss_clip": 0.01193527, + "auxiliary_loss_mlp": 0.01186579, + "balance_loss_clip": 1.0029279, + "balance_loss_mlp": 1.00175655, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.724155509507617, + "language_loss": 0.88191295, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90571398, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.575688600540161 + }, + { + "auxiliary_loss_clip": 0.01193362, + "auxiliary_loss_mlp": 0.01186451, + "balance_loss_clip": 1.00270784, + "balance_loss_mlp": 1.00153363, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.066175759722438, + "language_loss": 0.78172123, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.8055194, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.478454828262329 + }, + { + "auxiliary_loss_clip": 0.01195559, + "auxiliary_loss_mlp": 0.0118395, + "balance_loss_clip": 1.00544572, + "balance_loss_mlp": 1.00008142, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.844547516949534, + "language_loss": 0.54904491, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57284003, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.2879433631896973 + }, + { + "auxiliary_loss_clip": 0.01193239, + "auxiliary_loss_mlp": 0.0118661, + "balance_loss_clip": 1.00258303, + "balance_loss_mlp": 1.00169265, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 4.6845746422649865, + "language_loss": 0.83302796, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85682642, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.5049684047698975 + }, + { + "auxiliary_loss_clip": 0.01193462, + "auxiliary_loss_mlp": 0.01186271, + "balance_loss_clip": 1.00275731, + "balance_loss_mlp": 1.00135386, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.82228662984118, + "language_loss": 0.88785559, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91165286, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.4941132068634033 + }, + { + "auxiliary_loss_clip": 0.01193395, + "auxiliary_loss_mlp": 0.01185964, + "balance_loss_clip": 1.00275588, + "balance_loss_mlp": 1.0010469, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2099906004871617, + "language_loss": 0.71349978, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73729342, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.6276021003723145 + }, + { + "auxiliary_loss_clip": 0.01193257, + "auxiliary_loss_mlp": 0.01186076, + "balance_loss_clip": 1.00273585, + "balance_loss_mlp": 1.00153971, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.0928213706062015, + "language_loss": 0.82058704, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84438038, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 2.531266212463379 + }, + { + "auxiliary_loss_clip": 0.01193359, + "auxiliary_loss_mlp": 0.01186207, + "balance_loss_clip": 1.00277734, + "balance_loss_mlp": 1.00138485, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 3.4918832319802102, + "language_loss": 0.83593118, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85972685, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 4.09687352180481 + }, + { + "auxiliary_loss_clip": 0.01193298, + "auxiliary_loss_mlp": 0.01186095, + "balance_loss_clip": 1.00259316, + "balance_loss_mlp": 1.0015595, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1570569896992304, + "language_loss": 0.82198894, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84578288, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 4.040046691894531 + }, + { + "auxiliary_loss_clip": 0.01193026, + "auxiliary_loss_mlp": 0.0118626, + "balance_loss_clip": 1.00244713, + "balance_loss_mlp": 1.00162888, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 2.14504286848616, + "language_loss": 0.8720333, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89582616, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.5943896770477295 + }, + { + "auxiliary_loss_clip": 0.01193369, + "auxiliary_loss_mlp": 0.01186263, + "balance_loss_clip": 1.00273323, + "balance_loss_mlp": 1.00144076, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.772484224572779, + "language_loss": 0.79485112, + "learning_rate": 3.999991467983491e-06, + "loss": 0.8186475, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 3.937455177307129 + }, + { + "auxiliary_loss_clip": 0.0119323, + "auxiliary_loss_mlp": 0.01185892, + "balance_loss_clip": 1.00263119, + "balance_loss_mlp": 1.00116587, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 5.910719589505816, + "language_loss": 0.77098584, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79477704, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 4.087200403213501 + }, + { + "auxiliary_loss_clip": 0.01193066, + "auxiliary_loss_mlp": 0.01185788, + "balance_loss_clip": 1.00250924, + "balance_loss_mlp": 1.00115633, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.8544529669373964, + "language_loss": 0.82373095, + "learning_rate": 3.999989041101011e-06, + "loss": 0.84751946, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.711785078048706 + }, + { + "auxiliary_loss_clip": 0.01193164, + "auxiliary_loss_mlp": 0.01186085, + "balance_loss_clip": 1.00264311, + "balance_loss_mlp": 1.00135887, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 1.9455949584952283, + "language_loss": 0.78682435, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81061685, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.5888307094573975 + }, + { + "auxiliary_loss_clip": 0.01193205, + "auxiliary_loss_mlp": 0.01186085, + "balance_loss_clip": 1.0026629, + "balance_loss_mlp": 1.00145411, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 2.252596567407602, + "language_loss": 0.90775919, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93155211, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.5676519870758057 + }, + { + "auxiliary_loss_clip": 0.01193412, + "auxiliary_loss_mlp": 0.01186559, + "balance_loss_clip": 1.00292933, + "balance_loss_mlp": 1.00192761, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 2.121481748758747, + "language_loss": 0.86514604, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88894576, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.5719239711761475 + }, + { + "auxiliary_loss_clip": 0.01193109, + "auxiliary_loss_mlp": 0.01186203, + "balance_loss_clip": 1.00260377, + "balance_loss_mlp": 1.00166667, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 3.840924644175694, + "language_loss": 0.86848301, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89227617, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.5254147052764893 + }, + { + "auxiliary_loss_clip": 0.01193247, + "auxiliary_loss_mlp": 0.0118653, + "balance_loss_clip": 1.00272989, + "balance_loss_mlp": 1.00170851, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.709345262466497, + "language_loss": 0.89556748, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91936529, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.5991032123565674 + }, + { + "auxiliary_loss_clip": 0.01193242, + "auxiliary_loss_mlp": 0.01185927, + "balance_loss_clip": 1.00277925, + "balance_loss_mlp": 1.00148654, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.1402270806607246, + "language_loss": 0.71364778, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73743939, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.565556049346924 + }, + { + "auxiliary_loss_clip": 0.01193234, + "auxiliary_loss_mlp": 0.01186327, + "balance_loss_clip": 1.00269651, + "balance_loss_mlp": 1.00159991, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.035630810524545, + "language_loss": 0.84824359, + "learning_rate": 3.999978158061963e-06, + "loss": 0.8720392, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.499145269393921 + }, + { + "auxiliary_loss_clip": 0.01193186, + "auxiliary_loss_mlp": 0.01186016, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.00128949, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 3.1774423221513977, + "language_loss": 0.90246159, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92625356, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.5097696781158447 + }, + { + "auxiliary_loss_clip": 0.01193406, + "auxiliary_loss_mlp": 0.01186281, + "balance_loss_clip": 1.00289702, + "balance_loss_mlp": 1.00155425, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.8438268877637576, + "language_loss": 0.79889166, + "learning_rate": 3.999974366066933e-06, + "loss": 0.82268852, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.546574115753174 + }, + { + "auxiliary_loss_clip": 0.01193226, + "auxiliary_loss_mlp": 0.01186233, + "balance_loss_clip": 1.00272822, + "balance_loss_mlp": 1.00150585, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.395613144833393, + "language_loss": 0.80943274, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83322728, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.4782192707061768 + }, + { + "auxiliary_loss_clip": 0.0119327, + "auxiliary_loss_mlp": 0.01185969, + "balance_loss_clip": 1.00272226, + "balance_loss_mlp": 1.00143301, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 4.137759190526693, + "language_loss": 0.81246954, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83626199, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.5084054470062256 + }, + { + "auxiliary_loss_clip": 0.01193076, + "auxiliary_loss_mlp": 0.01186016, + "balance_loss_clip": 1.002617, + "balance_loss_mlp": 1.0013845, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.0629878123681618, + "language_loss": 0.93794394, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96173477, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.5063724517822266 + }, + { + "auxiliary_loss_clip": 0.01193007, + "auxiliary_loss_mlp": 0.01185967, + "balance_loss_clip": 1.00253701, + "balance_loss_mlp": 1.0013355, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.9315491964648734, + "language_loss": 0.84309399, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86688375, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.561760663986206 + }, + { + "auxiliary_loss_clip": 0.01193222, + "auxiliary_loss_mlp": 0.01186094, + "balance_loss_clip": 1.00281215, + "balance_loss_mlp": 1.00165367, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 1.8956127171030275, + "language_loss": 0.90477622, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92856944, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.56640362739563 + }, + { + "auxiliary_loss_clip": 0.01192961, + "auxiliary_loss_mlp": 0.01185731, + "balance_loss_clip": 1.00247324, + "balance_loss_mlp": 1.00129032, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 16.676036636735194, + "language_loss": 0.76066667, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78445357, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.541229486465454 + }, + { + "auxiliary_loss_clip": 0.01192962, + "auxiliary_loss_mlp": 0.01185555, + "balance_loss_clip": 1.00255871, + "balance_loss_mlp": 1.00101924, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.3395044948386925, + "language_loss": 0.90640473, + "learning_rate": 3.999958705152843e-06, + "loss": 0.93018997, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.555647850036621 + }, + { + "auxiliary_loss_clip": 0.01194823, + "auxiliary_loss_mlp": 0.01184116, + "balance_loss_clip": 1.0049212, + "balance_loss_mlp": 1.0002476, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7361948388218769, + "language_loss": 0.57902533, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60281473, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.1748135089874268 + }, + { + "auxiliary_loss_clip": 0.01192976, + "auxiliary_loss_mlp": 0.01186309, + "balance_loss_clip": 1.00260854, + "balance_loss_mlp": 1.00167727, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8157634871563877, + "language_loss": 0.86289966, + "learning_rate": 3.999953548056907e-06, + "loss": 0.88669258, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.5956828594207764 + }, + { + "auxiliary_loss_clip": 0.0119315, + "auxiliary_loss_mlp": 0.01185363, + "balance_loss_clip": 1.00269628, + "balance_loss_mlp": 1.00092244, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.232687809085833, + "language_loss": 0.77482712, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79861224, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.545841932296753 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.01186072, + "balance_loss_clip": 1.00259078, + "balance_loss_mlp": 1.00153553, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.1675416760099098, + "language_loss": 0.80734825, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83113849, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.590801239013672 + }, + { + "auxiliary_loss_clip": 0.01193056, + "auxiliary_loss_mlp": 0.01185968, + "balance_loss_clip": 1.00268102, + "balance_loss_mlp": 1.00133681, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.280799757326731, + "language_loss": 0.70181245, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72560263, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.602172613143921 + }, + { + "auxiliary_loss_clip": 0.01193096, + "auxiliary_loss_mlp": 0.01185796, + "balance_loss_clip": 1.00270331, + "balance_loss_mlp": 1.00154638, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.278380756779221, + "language_loss": 0.82681108, + "learning_rate": 3.999942323804607e-06, + "loss": 0.8506, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.5295186042785645 + }, + { + "auxiliary_loss_clip": 0.01193203, + "auxiliary_loss_mlp": 0.01185989, + "balance_loss_clip": 1.00267529, + "balance_loss_mlp": 1.0014528, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 2.210384758622074, + "language_loss": 0.79056758, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81435955, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.568702459335327 + }, + { + "auxiliary_loss_clip": 0.0119287, + "auxiliary_loss_mlp": 0.01185527, + "balance_loss_clip": 1.00255406, + "balance_loss_mlp": 1.00118148, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 3.3111848661045875, + "language_loss": 0.77745837, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80124235, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.574293375015259 + }, + { + "auxiliary_loss_clip": 0.0119318, + "auxiliary_loss_mlp": 0.01186078, + "balance_loss_clip": 1.00278032, + "balance_loss_mlp": 1.00163782, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 3.382451930726234, + "language_loss": 0.85489953, + "learning_rate": 3.999933109315878e-06, + "loss": 0.87869209, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.5337438583374023 + }, + { + "auxiliary_loss_clip": 0.01193028, + "auxiliary_loss_mlp": 0.01185778, + "balance_loss_clip": 1.00270259, + "balance_loss_mlp": 1.0014329, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.569112749308371, + "language_loss": 0.89327461, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.9170627, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.4988014698028564 + }, + { + "auxiliary_loss_clip": 0.01192945, + "auxiliary_loss_mlp": 0.01186079, + "balance_loss_clip": 1.00255311, + "balance_loss_mlp": 1.00163889, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.5087038739471894, + "language_loss": 0.70981514, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73360538, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.5437400341033936 + }, + { + "auxiliary_loss_clip": 0.01192855, + "auxiliary_loss_mlp": 0.01186086, + "balance_loss_clip": 1.00249946, + "balance_loss_mlp": 1.00154984, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.8026300311870944, + "language_loss": 0.91631281, + "learning_rate": 3.999923212288192e-06, + "loss": 0.94010222, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.546917676925659 + }, + { + "auxiliary_loss_clip": 0.01193021, + "auxiliary_loss_mlp": 0.01185945, + "balance_loss_clip": 1.00267601, + "balance_loss_mlp": 1.00159943, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 2.7595358217232393, + "language_loss": 0.65932119, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68311083, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.497326135635376 + }, + { + "auxiliary_loss_clip": 0.01192938, + "auxiliary_loss_mlp": 0.01185661, + "balance_loss_clip": 1.00254607, + "balance_loss_mlp": 1.00112534, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.3000191817297817, + "language_loss": 0.91982973, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94361573, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.545281171798706 + }, + { + "auxiliary_loss_clip": 0.01192833, + "auxiliary_loss_mlp": 0.01185629, + "balance_loss_clip": 1.00246048, + "balance_loss_mlp": 1.00128412, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.3570815246952073, + "language_loss": 0.81942415, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84320873, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.519244909286499 + }, + { + "auxiliary_loss_clip": 0.01192976, + "auxiliary_loss_mlp": 0.01185594, + "balance_loss_clip": 1.00254345, + "balance_loss_mlp": 1.00115323, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 2.4895821907986315, + "language_loss": 0.81144953, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83523524, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.533946990966797 + }, + { + "auxiliary_loss_clip": 0.01192864, + "auxiliary_loss_mlp": 0.01186157, + "balance_loss_clip": 1.00251245, + "balance_loss_mlp": 1.00171614, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 2.4044235673031267, + "language_loss": 0.67259419, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69638437, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 2.4979093074798584 + }, + { + "auxiliary_loss_clip": 0.01192873, + "auxiliary_loss_mlp": 0.01185559, + "balance_loss_clip": 1.00258851, + "balance_loss_mlp": 1.00140464, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 1.9888863957563299, + "language_loss": 0.85987401, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88365835, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 3.958622694015503 + }, + { + "auxiliary_loss_clip": 0.0119308, + "auxiliary_loss_mlp": 0.0118584, + "balance_loss_clip": 1.00278664, + "balance_loss_mlp": 1.0013994, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 1.9291293030836272, + "language_loss": 0.81383336, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83762258, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 3.9581212997436523 + }, + { + "auxiliary_loss_clip": 0.01192996, + "auxiliary_loss_mlp": 0.01186086, + "balance_loss_clip": 1.00253844, + "balance_loss_mlp": 1.00145435, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 15.897331365776521, + "language_loss": 0.86401033, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88780117, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 2.5660946369171143 + }, + { + "auxiliary_loss_clip": 0.01192917, + "auxiliary_loss_mlp": 0.01185726, + "balance_loss_clip": 1.00257552, + "balance_loss_mlp": 1.00147569, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.653354167078566, + "language_loss": 0.92772591, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95151234, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 3.908003330230713 + }, + { + "auxiliary_loss_clip": 0.01192981, + "auxiliary_loss_mlp": 0.01185653, + "balance_loss_clip": 1.00262439, + "balance_loss_mlp": 1.00140333, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.199657588659243, + "language_loss": 0.78931415, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81310052, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 3.9568402767181396 + }, + { + "auxiliary_loss_clip": 0.01192915, + "auxiliary_loss_mlp": 0.01185893, + "balance_loss_clip": 1.00262833, + "balance_loss_mlp": 1.00164318, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.4059115726755658, + "language_loss": 0.82015431, + "learning_rate": 3.999881083743795e-06, + "loss": 0.8439424, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.5165250301361084 + }, + { + "auxiliary_loss_clip": 0.01192881, + "auxiliary_loss_mlp": 0.01185606, + "balance_loss_clip": 1.00250459, + "balance_loss_mlp": 1.001261, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 4.127586732968756, + "language_loss": 0.88620627, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90999115, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.5517590045928955 + }, + { + "auxiliary_loss_clip": 0.01192906, + "auxiliary_loss_mlp": 0.01185592, + "balance_loss_clip": 1.00259721, + "balance_loss_mlp": 1.0014379, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.504555647117101, + "language_loss": 0.84215045, + "learning_rate": 3.999872438138503e-06, + "loss": 0.86593544, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.515916109085083 + }, + { + "auxiliary_loss_clip": 0.01192943, + "auxiliary_loss_mlp": 0.01185611, + "balance_loss_clip": 1.00260949, + "balance_loss_mlp": 1.001266, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 3.6084417667894817, + "language_loss": 0.94293129, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96671683, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.4749345779418945 + }, + { + "auxiliary_loss_clip": 0.01192794, + "auxiliary_loss_mlp": 0.01185474, + "balance_loss_clip": 1.00253057, + "balance_loss_mlp": 1.00132, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.3786211261859735, + "language_loss": 0.77498609, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79876882, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.5912022590637207 + }, + { + "auxiliary_loss_clip": 0.01192858, + "auxiliary_loss_mlp": 0.01185826, + "balance_loss_clip": 1.00252175, + "balance_loss_mlp": 1.00176692, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.837230066962206, + "language_loss": 0.87419236, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.89797914, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.5083627700805664 + }, + { + "auxiliary_loss_clip": 0.01193058, + "auxiliary_loss_mlp": 0.01185429, + "balance_loss_clip": 1.00273788, + "balance_loss_mlp": 1.00117922, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.3317353656022197, + "language_loss": 0.81928355, + "learning_rate": 3.999854236904925e-06, + "loss": 0.84306836, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.522592306137085 + }, + { + "auxiliary_loss_clip": 0.01192862, + "auxiliary_loss_mlp": 0.01185992, + "balance_loss_clip": 1.00258613, + "balance_loss_mlp": 1.00155115, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 2.2312253351252846, + "language_loss": 0.82359535, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84738386, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.5334174633026123 + }, + { + "auxiliary_loss_clip": 0.01192934, + "auxiliary_loss_mlp": 0.01185685, + "balance_loss_clip": 1.002635, + "balance_loss_mlp": 1.00162625, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.2034138665134053, + "language_loss": 0.84590876, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86969495, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.482217788696289 + }, + { + "auxiliary_loss_clip": 0.01193034, + "auxiliary_loss_mlp": 0.01185638, + "balance_loss_clip": 1.00274253, + "balance_loss_mlp": 1.00138807, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.6270599361525306, + "language_loss": 0.94126838, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96505511, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.489496946334839 + }, + { + "auxiliary_loss_clip": 0.01192828, + "auxiliary_loss_mlp": 0.01186022, + "balance_loss_clip": 1.00251162, + "balance_loss_mlp": 1.00148642, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.9251233216943975, + "language_loss": 0.94376612, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96755457, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.509971857070923 + }, + { + "auxiliary_loss_clip": 0.01194852, + "auxiliary_loss_mlp": 0.0118344, + "balance_loss_clip": 1.00506425, + "balance_loss_mlp": 1.00033438, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1427052141316367, + "language_loss": 0.54924238, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57302535, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.175905227661133 + }, + { + "auxiliary_loss_clip": 0.01192899, + "auxiliary_loss_mlp": 0.01185509, + "balance_loss_clip": 1.00262022, + "balance_loss_mlp": 1.00135493, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.3922987776144775, + "language_loss": 0.76855361, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79233772, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.607160806655884 + }, + { + "auxiliary_loss_clip": 0.01192762, + "auxiliary_loss_mlp": 0.01185632, + "balance_loss_clip": 1.00262201, + "balance_loss_mlp": 1.00147784, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 1.9750314806518348, + "language_loss": 0.80868912, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83247304, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.6164913177490234 + }, + { + "auxiliary_loss_clip": 0.01192819, + "auxiliary_loss_mlp": 0.01185744, + "balance_loss_clip": 1.00255561, + "balance_loss_mlp": 1.00178003, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.932725461393368, + "language_loss": 0.86364144, + "learning_rate": 3.999814194385413e-06, + "loss": 0.88742709, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.516998291015625 + }, + { + "auxiliary_loss_clip": 0.01192797, + "auxiliary_loss_mlp": 0.011854, + "balance_loss_clip": 1.00254214, + "balance_loss_mlp": 1.0015316, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.70298584786943, + "language_loss": 0.95920539, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98298728, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.5390045642852783 + }, + { + "auxiliary_loss_clip": 0.01192671, + "auxiliary_loss_mlp": 0.01185747, + "balance_loss_clip": 1.00239515, + "balance_loss_mlp": 1.00159216, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.0480950783809044, + "language_loss": 0.79962492, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.82340908, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.511528253555298 + }, + { + "auxiliary_loss_clip": 0.01192805, + "auxiliary_loss_mlp": 0.01185794, + "balance_loss_clip": 1.00264955, + "balance_loss_mlp": 1.00154376, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.303291420319759, + "language_loss": 0.80317092, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82695687, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.5774197578430176 + }, + { + "auxiliary_loss_clip": 0.01192923, + "auxiliary_loss_mlp": 0.01185695, + "balance_loss_clip": 1.00268281, + "balance_loss_mlp": 1.00144529, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.959810562869345, + "language_loss": 0.84952748, + "learning_rate": 3.999792353123774e-06, + "loss": 0.87331361, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.644193410873413 + }, + { + "auxiliary_loss_clip": 0.01192802, + "auxiliary_loss_mlp": 0.01185458, + "balance_loss_clip": 1.00249243, + "balance_loss_mlp": 1.00130367, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.2322582666262742, + "language_loss": 0.76289952, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78668213, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.4789907932281494 + }, + { + "auxiliary_loss_clip": 0.01192752, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_clip": 1.00258005, + "balance_loss_mlp": 1.00148523, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 3.6875176520959734, + "language_loss": 0.83794349, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.8617264, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.52203369140625 + }, + { + "auxiliary_loss_clip": 0.01192756, + "auxiliary_loss_mlp": 0.01185698, + "balance_loss_clip": 1.00266719, + "balance_loss_mlp": 1.00163889, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.2533139382195144, + "language_loss": 0.83843482, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86221933, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.519420623779297 + }, + { + "auxiliary_loss_clip": 0.01192852, + "auxiliary_loss_mlp": 0.01185545, + "balance_loss_clip": 1.00282753, + "balance_loss_mlp": 1.00158143, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.4219911988120475, + "language_loss": 0.86528373, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88906765, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.530897617340088 + }, + { + "auxiliary_loss_clip": 0.01192782, + "auxiliary_loss_mlp": 0.01185222, + "balance_loss_clip": 1.00265765, + "balance_loss_mlp": 1.00125849, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 2.107434667028551, + "language_loss": 0.72729552, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.75107557, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 2.50569748878479 + }, + { + "auxiliary_loss_clip": 0.01192679, + "auxiliary_loss_mlp": 0.01185783, + "balance_loss_clip": 1.00250602, + "balance_loss_mlp": 1.00162864, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 2.794240365223389, + "language_loss": 0.77521181, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79899639, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.530647039413452 + }, + { + "auxiliary_loss_clip": 0.01192672, + "auxiliary_loss_mlp": 0.01185656, + "balance_loss_clip": 1.00249195, + "balance_loss_mlp": 1.00169206, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 1.9788026355439856, + "language_loss": 0.86959839, + "learning_rate": 3.999751211379863e-06, + "loss": 0.8933816, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.496190309524536 + }, + { + "auxiliary_loss_clip": 0.01192738, + "auxiliary_loss_mlp": 0.01185503, + "balance_loss_clip": 1.00259531, + "balance_loss_mlp": 1.00144362, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.433062757204346, + "language_loss": 0.82371968, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84750211, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.4736862182617188 + }, + { + "auxiliary_loss_clip": 0.01192779, + "auxiliary_loss_mlp": 0.0118533, + "balance_loss_clip": 1.00264597, + "balance_loss_mlp": 1.0013665, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.543793741743634, + "language_loss": 0.77056134, + "learning_rate": 3.99973877411558e-06, + "loss": 0.7943424, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.484957695007324 + }, + { + "auxiliary_loss_clip": 0.01192596, + "auxiliary_loss_mlp": 0.01185418, + "balance_loss_clip": 1.00254273, + "balance_loss_mlp": 1.00174046, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.277784576071648, + "language_loss": 0.87554449, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89932466, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.5122745037078857 + }, + { + "auxiliary_loss_clip": 0.01192817, + "auxiliary_loss_mlp": 0.0118581, + "balance_loss_clip": 1.00264716, + "balance_loss_mlp": 1.00175118, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.449053501467291, + "language_loss": 0.80907178, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83285797, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.4993655681610107 + }, + { + "auxiliary_loss_clip": 0.01192652, + "auxiliary_loss_mlp": 0.01185309, + "balance_loss_clip": 1.00248218, + "balance_loss_mlp": 1.00144041, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 3.5322960092133737, + "language_loss": 0.93463987, + "learning_rate": 3.999719549492551e-06, + "loss": 0.9584195, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.479546308517456 + }, + { + "auxiliary_loss_clip": 0.01192625, + "auxiliary_loss_mlp": 0.01185456, + "balance_loss_clip": 1.00255585, + "balance_loss_mlp": 1.001683, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.433091643148098, + "language_loss": 0.87668383, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90046465, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.491103172302246 + }, + { + "auxiliary_loss_clip": 0.01192698, + "auxiliary_loss_mlp": 0.01185553, + "balance_loss_clip": 1.00261998, + "balance_loss_mlp": 1.00158954, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.9282470927563204, + "language_loss": 0.76500088, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78878337, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 2.5010197162628174 + }, + { + "auxiliary_loss_clip": 0.01192634, + "auxiliary_loss_mlp": 0.01185293, + "balance_loss_clip": 1.00253487, + "balance_loss_mlp": 1.00152016, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 2.020410017193249, + "language_loss": 0.78914678, + "learning_rate": 3.999699642403449e-06, + "loss": 0.81292605, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.5192103385925293 + }, + { + "auxiliary_loss_clip": 0.01192581, + "auxiliary_loss_mlp": 0.01185381, + "balance_loss_clip": 1.00243092, + "balance_loss_mlp": 1.00151324, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 6.124304142041308, + "language_loss": 0.93878657, + "learning_rate": 3.99969285504912e-06, + "loss": 0.9625662, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 5.3456127643585205 + }, + { + "auxiliary_loss_clip": 0.01192546, + "auxiliary_loss_mlp": 0.01185091, + "balance_loss_clip": 1.00250494, + "balance_loss_mlp": 1.00131774, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 3.045002744075773, + "language_loss": 0.83854127, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86231756, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.6122305393218994 + }, + { + "auxiliary_loss_clip": 0.01192761, + "auxiliary_loss_mlp": 0.0118535, + "balance_loss_clip": 1.0026474, + "balance_loss_mlp": 1.00138652, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 3.1885670088133953, + "language_loss": 0.87027276, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89405388, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 3.8820695877075195 + }, + { + "auxiliary_loss_clip": 0.01192488, + "auxiliary_loss_mlp": 0.01185304, + "balance_loss_clip": 1.00239015, + "balance_loss_mlp": 1.00143552, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 1.9256763410089859, + "language_loss": 0.82818007, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85195798, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 3.887955665588379 + }, + { + "auxiliary_loss_clip": 0.01195221, + "auxiliary_loss_mlp": 0.0118417, + "balance_loss_clip": 1.00548959, + "balance_loss_mlp": 1.00106454, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8822730945119107, + "language_loss": 0.59806514, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62185907, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.145981550216675 + }, + { + "auxiliary_loss_clip": 0.01192627, + "auxiliary_loss_mlp": 0.01184904, + "balance_loss_clip": 1.00265384, + "balance_loss_mlp": 1.00122666, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 2.3022109708619594, + "language_loss": 0.87240934, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89618468, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.5027272701263428 + }, + { + "auxiliary_loss_clip": 0.01192541, + "auxiliary_loss_mlp": 0.01185609, + "balance_loss_clip": 1.00247955, + "balance_loss_mlp": 1.00174069, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.1920803223172465, + "language_loss": 0.83688462, + "learning_rate": 3.999650538532742e-06, + "loss": 0.86066616, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.556422710418701 + }, + { + "auxiliary_loss_clip": 0.01192505, + "auxiliary_loss_mlp": 0.0118551, + "balance_loss_clip": 1.00256526, + "balance_loss_mlp": 1.00183225, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 2.7088269795274957, + "language_loss": 0.9651562, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98893636, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.464899778366089 + }, + { + "auxiliary_loss_clip": 0.01192608, + "auxiliary_loss_mlp": 0.01185398, + "balance_loss_clip": 1.00259006, + "balance_loss_mlp": 1.00172007, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.246195704228398, + "language_loss": 0.82839847, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85217851, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.5034258365631104 + }, + { + "auxiliary_loss_clip": 0.01192455, + "auxiliary_loss_mlp": 0.01185106, + "balance_loss_clip": 1.00257874, + "balance_loss_mlp": 1.00142837, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 1.6149738907450648, + "language_loss": 0.81228721, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83606285, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.5189156532287598 + }, + { + "auxiliary_loss_clip": 0.0119239, + "auxiliary_loss_mlp": 0.01185141, + "balance_loss_clip": 1.00249887, + "balance_loss_mlp": 1.00146341, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 2.0060730331669396, + "language_loss": 0.81200182, + "learning_rate": 3.999620810979295e-06, + "loss": 0.8357771, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.4897360801696777 + }, + { + "auxiliary_loss_clip": 0.01192499, + "auxiliary_loss_mlp": 0.01185057, + "balance_loss_clip": 1.00248265, + "balance_loss_mlp": 1.00128353, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 3.3150665028919044, + "language_loss": 0.86401176, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88778734, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.5137877464294434 + }, + { + "auxiliary_loss_clip": 0.01192196, + "auxiliary_loss_mlp": 0.01184953, + "balance_loss_clip": 1.00236678, + "balance_loss_mlp": 1.00156105, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.2115716991174263, + "language_loss": 0.82128799, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84505939, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.4861533641815186 + }, + { + "auxiliary_loss_clip": 0.011924, + "auxiliary_loss_mlp": 0.01184926, + "balance_loss_clip": 1.00246024, + "balance_loss_mlp": 1.00143886, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 4.6016376625531, + "language_loss": 0.75600326, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77977645, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.509197473526001 + }, + { + "auxiliary_loss_clip": 0.01192478, + "auxiliary_loss_mlp": 0.01184918, + "balance_loss_clip": 1.00259769, + "balance_loss_mlp": 1.00133562, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 2.6537537300504055, + "language_loss": 0.79554069, + "learning_rate": 3.999589870212761e-06, + "loss": 0.81931466, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.571908712387085 + }, + { + "auxiliary_loss_clip": 0.01192541, + "auxiliary_loss_mlp": 0.01184857, + "balance_loss_clip": 1.0027014, + "balance_loss_mlp": 1.00136948, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 2.084163135144594, + "language_loss": 0.8662864, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89006042, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.5191993713378906 + }, + { + "auxiliary_loss_clip": 0.01192403, + "auxiliary_loss_mlp": 0.01185154, + "balance_loss_clip": 1.00255275, + "balance_loss_mlp": 1.00147653, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 2.6060012214988384, + "language_loss": 0.80846411, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83223969, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 2.488247871398926 + }, + { + "auxiliary_loss_clip": 0.01192434, + "auxiliary_loss_mlp": 0.01185243, + "balance_loss_clip": 1.00247812, + "balance_loss_mlp": 1.0016607, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 3.1533931572128737, + "language_loss": 0.85390389, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87768066, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.477473258972168 + }, + { + "auxiliary_loss_clip": 0.01192462, + "auxiliary_loss_mlp": 0.01185255, + "balance_loss_clip": 1.00252867, + "balance_loss_mlp": 1.0015775, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.090293644862129, + "language_loss": 0.8249734, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84875059, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.5220978260040283 + }, + { + "auxiliary_loss_clip": 0.01192466, + "auxiliary_loss_mlp": 0.01184792, + "balance_loss_clip": 1.00256968, + "balance_loss_mlp": 1.00130463, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.721938317466761, + "language_loss": 0.83261448, + "learning_rate": 3.999549488202358e-06, + "loss": 0.85638702, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.5602502822875977 + }, + { + "auxiliary_loss_clip": 0.01192453, + "auxiliary_loss_mlp": 0.01184839, + "balance_loss_clip": 1.00259233, + "balance_loss_mlp": 1.00125694, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.3708567966114855, + "language_loss": 0.82200611, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84577906, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.473001003265381 + }, + { + "auxiliary_loss_clip": 0.01192658, + "auxiliary_loss_mlp": 0.01184961, + "balance_loss_clip": 1.00276423, + "balance_loss_mlp": 1.00156999, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 1.9777275069760778, + "language_loss": 0.79133058, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81510681, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.5228326320648193 + }, + { + "auxiliary_loss_clip": 0.01192585, + "auxiliary_loss_mlp": 0.0118528, + "balance_loss_clip": 1.00267816, + "balance_loss_mlp": 1.00160182, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 1.9597114057550096, + "language_loss": 0.87139297, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89517164, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.495661497116089 + }, + { + "auxiliary_loss_clip": 0.01192483, + "auxiliary_loss_mlp": 0.01185333, + "balance_loss_clip": 1.00263977, + "balance_loss_mlp": 1.00194168, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.4463124585385514, + "language_loss": 0.72839224, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75217044, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.528541088104248 + }, + { + "auxiliary_loss_clip": 0.01192418, + "auxiliary_loss_mlp": 0.0118517, + "balance_loss_clip": 1.00251448, + "balance_loss_mlp": 1.00158787, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.739355620812549, + "language_loss": 0.79271472, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81649065, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.541306734085083 + }, + { + "auxiliary_loss_clip": 0.01192357, + "auxiliary_loss_mlp": 0.01185026, + "balance_loss_clip": 1.00251269, + "balance_loss_mlp": 1.00153947, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 1.8654722861323252, + "language_loss": 0.93576837, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95954216, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.561131477355957 + }, + { + "auxiliary_loss_clip": 0.01192511, + "auxiliary_loss_mlp": 0.0118522, + "balance_loss_clip": 1.00265956, + "balance_loss_mlp": 1.00154257, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 4.093455291888333, + "language_loss": 0.72946, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75323731, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.6069791316986084 + }, + { + "auxiliary_loss_clip": 0.01192332, + "auxiliary_loss_mlp": 0.01185186, + "balance_loss_clip": 1.00243783, + "balance_loss_mlp": 1.00179434, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 1.886050998602862, + "language_loss": 0.81548893, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83926415, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.541031837463379 + }, + { + "auxiliary_loss_clip": 0.01192486, + "auxiliary_loss_mlp": 0.01184803, + "balance_loss_clip": 1.00259936, + "balance_loss_mlp": 1.00122023, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.2212887247251465, + "language_loss": 0.67414844, + "learning_rate": 3.999472023754499e-06, + "loss": 0.69792128, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 2.5717716217041016 + }, + { + "auxiliary_loss_clip": 0.01192586, + "auxiliary_loss_mlp": 0.01184604, + "balance_loss_clip": 1.00280094, + "balance_loss_mlp": 1.00111723, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 2.15866330748464, + "language_loss": 0.80230308, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82607496, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 2.4770760536193848 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01184983, + "balance_loss_clip": 1.00248945, + "balance_loss_mlp": 1.00159144, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.1778801990080483, + "language_loss": 0.91433316, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93810689, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.463949680328369 + }, + { + "auxiliary_loss_clip": 0.01192395, + "auxiliary_loss_mlp": 0.01185039, + "balance_loss_clip": 1.00256228, + "balance_loss_mlp": 1.00164795, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.5315527677967427, + "language_loss": 0.94174761, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96552199, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.4913365840911865 + }, + { + "auxiliary_loss_clip": 0.01192327, + "auxiliary_loss_mlp": 0.01185144, + "balance_loss_clip": 1.00246668, + "balance_loss_mlp": 1.00165677, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.919225093760083, + "language_loss": 0.77397996, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79775465, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.5175774097442627 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01184614, + "balance_loss_clip": 1.00262225, + "balance_loss_mlp": 1.00112736, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.772858350416748, + "language_loss": 0.86600745, + "learning_rate": 3.999426334228518e-06, + "loss": 0.8897776, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.4872961044311523 + }, + { + "auxiliary_loss_clip": 0.01192384, + "auxiliary_loss_mlp": 0.01184773, + "balance_loss_clip": 1.00257325, + "balance_loss_mlp": 1.00166762, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.8625885067260795, + "language_loss": 0.90451813, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92828971, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.4834091663360596 + }, + { + "auxiliary_loss_clip": 0.0119247, + "auxiliary_loss_mlp": 0.01185188, + "balance_loss_clip": 1.00269055, + "balance_loss_mlp": 1.00189161, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9853550251032464, + "language_loss": 0.84379458, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86757118, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.4840126037597656 + }, + { + "auxiliary_loss_clip": 0.01192297, + "auxiliary_loss_mlp": 0.01184562, + "balance_loss_clip": 1.00255299, + "balance_loss_mlp": 1.00126576, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.6988762649747047, + "language_loss": 0.66787255, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69164109, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.60333251953125 + }, + { + "auxiliary_loss_clip": 0.01192421, + "auxiliary_loss_mlp": 0.01184666, + "balance_loss_clip": 1.00257576, + "balance_loss_mlp": 1.00137019, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 1.893488786083806, + "language_loss": 0.77287406, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79664493, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.6366608142852783 + }, + { + "auxiliary_loss_clip": 0.01192476, + "auxiliary_loss_mlp": 0.01184575, + "balance_loss_clip": 1.00267005, + "balance_loss_mlp": 1.0013746, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.099539226612571, + "language_loss": 0.81734312, + "learning_rate": 3.999378749241506e-06, + "loss": 0.84111363, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 5.355368614196777 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01184622, + "balance_loss_clip": 1.00275898, + "balance_loss_mlp": 1.00132537, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 1.8793415224255565, + "language_loss": 0.89038807, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91416085, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.5357062816619873 + }, + { + "auxiliary_loss_clip": 0.01192249, + "auxiliary_loss_mlp": 0.01184817, + "balance_loss_clip": 1.00250816, + "balance_loss_mlp": 1.00161648, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.2221455327003676, + "language_loss": 0.79939687, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82316756, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 3.9633395671844482 + }, + { + "auxiliary_loss_clip": 0.01192417, + "auxiliary_loss_mlp": 0.01184561, + "balance_loss_clip": 1.00256848, + "balance_loss_mlp": 1.0011692, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7056969375092121, + "language_loss": 0.77181137, + "learning_rate": 3.999349288446696e-06, + "loss": 0.7955811, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.5028560161590576 + }, + { + "auxiliary_loss_clip": 0.0119237, + "auxiliary_loss_mlp": 0.01184618, + "balance_loss_clip": 1.00258684, + "balance_loss_mlp": 1.00132179, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 4.443677635222095, + "language_loss": 0.91686255, + "learning_rate": 3.99933931655021e-06, + "loss": 0.94063246, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.4765193462371826 + }, + { + "auxiliary_loss_clip": 0.01192201, + "auxiliary_loss_mlp": 0.0118464, + "balance_loss_clip": 1.00254536, + "balance_loss_mlp": 1.00182009, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.9292882752406055, + "language_loss": 0.92180032, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94556868, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.5153956413269043 + }, + { + "auxiliary_loss_clip": 0.01192455, + "auxiliary_loss_mlp": 0.01184585, + "balance_loss_clip": 1.00265455, + "balance_loss_mlp": 1.00128925, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 1.839705840931025, + "language_loss": 0.83218062, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85595095, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.533123254776001 + }, + { + "auxiliary_loss_clip": 0.01192333, + "auxiliary_loss_mlp": 0.01184613, + "balance_loss_clip": 1.00260866, + "balance_loss_mlp": 1.00122166, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.600677701286774, + "language_loss": 0.69653356, + "learning_rate": 3.999308945971392e-06, + "loss": 0.720303, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.5736780166625977 + }, + { + "auxiliary_loss_clip": 0.0119482, + "auxiliary_loss_mlp": 0.01183493, + "balance_loss_clip": 1.00541353, + "balance_loss_mlp": 1.00115013, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8791316505378949, + "language_loss": 0.61591083, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63969398, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.1388492584228516 + }, + { + "auxiliary_loss_clip": 0.01192215, + "auxiliary_loss_mlp": 0.01184407, + "balance_loss_clip": 1.00257361, + "balance_loss_mlp": 1.00139666, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.1421060052415535, + "language_loss": 0.83799595, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86176217, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.5206806659698486 + }, + { + "auxiliary_loss_clip": 0.01192276, + "auxiliary_loss_mlp": 0.01184683, + "balance_loss_clip": 1.0025816, + "balance_loss_mlp": 1.0013864, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 3.0160377157282903, + "language_loss": 0.79635513, + "learning_rate": 3.999277893066632e-06, + "loss": 0.82012475, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.4523708820343018 + }, + { + "auxiliary_loss_clip": 0.01192169, + "auxiliary_loss_mlp": 0.01184713, + "balance_loss_clip": 1.00245476, + "balance_loss_mlp": 1.0016073, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 1.848620717117736, + "language_loss": 0.84231645, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86608529, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.0119225, + "auxiliary_loss_mlp": 0.01184518, + "balance_loss_clip": 1.00248289, + "balance_loss_mlp": 1.00131738, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.2945685634984945, + "language_loss": 0.69788611, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72165376, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.517850637435913 + }, + { + "auxiliary_loss_clip": 0.011922, + "auxiliary_loss_mlp": 0.01184344, + "balance_loss_clip": 1.00248957, + "balance_loss_mlp": 1.00123894, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 3.248367657279619, + "language_loss": 0.85337281, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87713826, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.5133275985717773 + }, + { + "auxiliary_loss_clip": 0.0119224, + "auxiliary_loss_mlp": 0.0118477, + "balance_loss_clip": 1.00254512, + "balance_loss_mlp": 1.00147331, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 5.721944182014499, + "language_loss": 0.82174861, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84551877, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.5039150714874268 + }, + { + "auxiliary_loss_clip": 0.0119465, + "auxiliary_loss_mlp": 0.01182891, + "balance_loss_clip": 1.00531399, + "balance_loss_mlp": 1.0005486, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9026334915477385, + "language_loss": 0.65434444, + "learning_rate": 3.999224621974381e-06, + "loss": 0.6781199, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.099015712738037 + }, + { + "auxiliary_loss_clip": 0.01192045, + "auxiliary_loss_mlp": 0.01184109, + "balance_loss_clip": 1.00244045, + "balance_loss_mlp": 1.00109887, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9001439944175997, + "language_loss": 0.79866344, + "learning_rate": 3.999213740321906e-06, + "loss": 0.82242501, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.5147454738616943 + }, + { + "auxiliary_loss_clip": 0.01192082, + "auxiliary_loss_mlp": 0.01184237, + "balance_loss_clip": 1.00252318, + "balance_loss_mlp": 1.00122643, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 9.712091972414482, + "language_loss": 0.82993525, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85369843, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.5043036937713623 + }, + { + "auxiliary_loss_clip": 0.0119207, + "auxiliary_loss_mlp": 0.0118412, + "balance_loss_clip": 1.00251675, + "balance_loss_mlp": 1.00110984, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 1.9809435390611285, + "language_loss": 0.82126099, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84502292, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.5974934101104736 + }, + { + "auxiliary_loss_clip": 0.01192045, + "auxiliary_loss_mlp": 0.01184373, + "balance_loss_clip": 1.00242853, + "balance_loss_mlp": 1.00117242, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4105492900345205, + "language_loss": 0.81896991, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.8427341, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.5403757095336914 + }, + { + "auxiliary_loss_clip": 0.01192371, + "auxiliary_loss_mlp": 0.01184471, + "balance_loss_clip": 1.00270808, + "balance_loss_mlp": 1.00136542, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8119406585786246, + "language_loss": 0.82009995, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84386837, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.4997551441192627 + }, + { + "auxiliary_loss_clip": 0.01192091, + "auxiliary_loss_mlp": 0.01184292, + "balance_loss_clip": 1.00253773, + "balance_loss_mlp": 1.0012821, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9186342846967428, + "language_loss": 0.84494841, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86871231, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.5958147048950195 + }, + { + "auxiliary_loss_clip": 0.01192164, + "auxiliary_loss_mlp": 0.0118435, + "balance_loss_clip": 1.00254333, + "balance_loss_mlp": 1.0011487, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 1.9912850555095896, + "language_loss": 0.84593892, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86970407, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01192241, + "auxiliary_loss_mlp": 0.01184456, + "balance_loss_clip": 1.00257301, + "balance_loss_mlp": 1.00125504, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8861542848666082, + "language_loss": 0.79870224, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82246917, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.503026247024536 + }, + { + "auxiliary_loss_clip": 0.01192063, + "auxiliary_loss_mlp": 0.01184209, + "balance_loss_clip": 1.00247979, + "balance_loss_mlp": 1.00110376, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.053957329128184, + "language_loss": 0.78778207, + "learning_rate": 3.9991239579635e-06, + "loss": 0.81154478, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.479421615600586 + }, + { + "auxiliary_loss_clip": 0.01191984, + "auxiliary_loss_mlp": 0.01184187, + "balance_loss_clip": 1.00247276, + "balance_loss_mlp": 1.00108123, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.5490652623932313, + "language_loss": 0.87364274, + "learning_rate": 3.999112394032757e-06, + "loss": 0.89740443, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 2.48832106590271 + }, + { + "auxiliary_loss_clip": 0.01192113, + "auxiliary_loss_mlp": 0.01184295, + "balance_loss_clip": 1.00257778, + "balance_loss_mlp": 1.00118971, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.6880294506966473, + "language_loss": 0.79489964, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81866366, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.6105406284332275 + }, + { + "auxiliary_loss_clip": 0.01192124, + "auxiliary_loss_mlp": 0.01184351, + "balance_loss_clip": 1.00256371, + "balance_loss_mlp": 1.00115061, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.8536172925440995, + "language_loss": 0.85815251, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88191724, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.561945676803589 + }, + { + "auxiliary_loss_clip": 0.01194074, + "auxiliary_loss_mlp": 0.01182566, + "balance_loss_clip": 1.00492167, + "balance_loss_mlp": 1.0002234, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7161306781468633, + "language_loss": 0.49885154, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52261794, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.1760196685791016 + }, + { + "auxiliary_loss_clip": 0.01192009, + "auxiliary_loss_mlp": 0.01184079, + "balance_loss_clip": 1.00252962, + "balance_loss_mlp": 1.0011642, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.2874087804333962, + "language_loss": 0.80871487, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83247566, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.525151014328003 + }, + { + "auxiliary_loss_clip": 0.01192026, + "auxiliary_loss_mlp": 0.01184499, + "balance_loss_clip": 1.00244641, + "balance_loss_mlp": 1.00139391, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.6724240036161047, + "language_loss": 0.76292777, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78669298, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 2.4704701900482178 + }, + { + "auxiliary_loss_clip": 0.01191998, + "auxiliary_loss_mlp": 0.0118395, + "balance_loss_clip": 1.00255609, + "balance_loss_mlp": 1.00093985, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 1.9415463226198484, + "language_loss": 0.81935918, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84311867, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.5404343605041504 + }, + { + "auxiliary_loss_clip": 0.01191929, + "auxiliary_loss_mlp": 0.0118452, + "balance_loss_clip": 1.00246239, + "balance_loss_mlp": 1.00141478, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 3.2479054600395587, + "language_loss": 0.91013092, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93389541, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.508625030517578 + }, + { + "auxiliary_loss_clip": 0.01192063, + "auxiliary_loss_mlp": 0.01184201, + "balance_loss_clip": 1.00248528, + "balance_loss_mlp": 1.00119066, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.80074744536678, + "language_loss": 0.79654014, + "learning_rate": 3.999017153588724e-06, + "loss": 0.82030278, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.50244402885437 + }, + { + "auxiliary_loss_clip": 0.01192026, + "auxiliary_loss_mlp": 0.01184153, + "balance_loss_clip": 1.00258911, + "balance_loss_mlp": 1.0011425, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.8909001368832816, + "language_loss": 0.81691957, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84068131, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.530766010284424 + }, + { + "auxiliary_loss_clip": 0.01193837, + "auxiliary_loss_mlp": 0.01182559, + "balance_loss_clip": 1.00479794, + "balance_loss_mlp": 1.00021672, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9054896943377655, + "language_loss": 0.69304556, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71680957, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.249371290206909 + }, + { + "auxiliary_loss_clip": 0.01192016, + "auxiliary_loss_mlp": 0.01183956, + "balance_loss_clip": 1.00254047, + "balance_loss_mlp": 1.00123239, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.572170028782098, + "language_loss": 0.82879865, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85255843, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.5091605186462402 + }, + { + "auxiliary_loss_clip": 0.01191964, + "auxiliary_loss_mlp": 0.01183961, + "balance_loss_clip": 1.00250411, + "balance_loss_mlp": 1.00095117, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.4274224633338175, + "language_loss": 0.87338847, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89714772, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.521165370941162 + }, + { + "auxiliary_loss_clip": 0.01191829, + "auxiliary_loss_mlp": 0.01183774, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.00105011, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 1.977819775046585, + "language_loss": 0.84978813, + "learning_rate": 3.998955164701281e-06, + "loss": 0.8735441, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 3.8619656562805176 + }, + { + "auxiliary_loss_clip": 0.01192175, + "auxiliary_loss_mlp": 0.01184775, + "balance_loss_clip": 1.00267506, + "balance_loss_mlp": 1.00166965, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 1.9298351524285453, + "language_loss": 0.81737447, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84114397, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 4.015764951705933 + }, + { + "auxiliary_loss_clip": 0.01191878, + "auxiliary_loss_mlp": 0.01184014, + "balance_loss_clip": 1.00245929, + "balance_loss_mlp": 1.00119507, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.355017263262047, + "language_loss": 0.8690629, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89282191, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.518272638320923 + }, + { + "auxiliary_loss_clip": 0.01191961, + "auxiliary_loss_mlp": 0.01183893, + "balance_loss_clip": 1.0026387, + "balance_loss_mlp": 1.00107312, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.2760417762505147, + "language_loss": 0.81006575, + "learning_rate": 3.998917061758087e-06, + "loss": 0.83382422, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 5.221538782119751 + }, + { + "auxiliary_loss_clip": 0.01193472, + "auxiliary_loss_mlp": 0.01181769, + "balance_loss_clip": 1.00448871, + "balance_loss_mlp": 1.00018978, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7859213093693892, + "language_loss": 0.60094833, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62470078, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.215336322784424 + }, + { + "auxiliary_loss_clip": 0.01191942, + "auxiliary_loss_mlp": 0.01184003, + "balance_loss_clip": 1.00253725, + "balance_loss_mlp": 1.00127876, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 4.421524741544399, + "language_loss": 0.85924697, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88300639, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.534487724304199 + }, + { + "auxiliary_loss_clip": 0.01191927, + "auxiliary_loss_mlp": 0.01184035, + "balance_loss_clip": 1.00258875, + "balance_loss_mlp": 1.00131106, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 2.444969662817159, + "language_loss": 0.75293428, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77669388, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.4992570877075195 + }, + { + "auxiliary_loss_clip": 0.01192006, + "auxiliary_loss_mlp": 0.01184381, + "balance_loss_clip": 1.00258136, + "balance_loss_mlp": 1.0014658, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.097588715948761, + "language_loss": 0.92463994, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94840378, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.481919765472412 + }, + { + "auxiliary_loss_clip": 0.01191937, + "auxiliary_loss_mlp": 0.01184236, + "balance_loss_clip": 1.00258517, + "balance_loss_mlp": 1.00141704, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.9533298011172362, + "language_loss": 0.9022795, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92604125, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.4873087406158447 + }, + { + "auxiliary_loss_clip": 0.01191733, + "auxiliary_loss_mlp": 0.01184294, + "balance_loss_clip": 1.00240636, + "balance_loss_mlp": 1.00157022, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 1.9096468664612167, + "language_loss": 0.74896222, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77272248, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.5263967514038086 + }, + { + "auxiliary_loss_clip": 0.0119186, + "auxiliary_loss_mlp": 0.01183923, + "balance_loss_clip": 1.00248694, + "balance_loss_mlp": 1.00119901, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.7640605822452375, + "language_loss": 0.77917266, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80293047, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.5070748329162598 + }, + { + "auxiliary_loss_clip": 0.01191906, + "auxiliary_loss_mlp": 0.01184167, + "balance_loss_clip": 1.00255919, + "balance_loss_mlp": 1.00144267, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.8613328588273705, + "language_loss": 0.7673986, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79115927, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.515780210494995 + }, + { + "auxiliary_loss_clip": 0.01192016, + "auxiliary_loss_mlp": 0.01184281, + "balance_loss_clip": 1.00257742, + "balance_loss_mlp": 1.00136614, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.864672023974696, + "language_loss": 0.85186458, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87562752, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.4760231971740723 + }, + { + "auxiliary_loss_clip": 0.01192007, + "auxiliary_loss_mlp": 0.01184035, + "balance_loss_clip": 1.00263929, + "balance_loss_mlp": 1.00111985, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 2.011042567429536, + "language_loss": 0.76412261, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78788298, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.5416319370269775 + }, + { + "auxiliary_loss_clip": 0.0119195, + "auxiliary_loss_mlp": 0.01184176, + "balance_loss_clip": 1.00254679, + "balance_loss_mlp": 1.00135636, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.714547220524833, + "language_loss": 0.82056469, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84432596, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.629833221435547 + }, + { + "auxiliary_loss_clip": 0.01192052, + "auxiliary_loss_mlp": 0.01183826, + "balance_loss_clip": 1.00268912, + "balance_loss_mlp": 1.0011977, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.8988973257113382, + "language_loss": 0.76149881, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78525752, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.5566093921661377 + }, + { + "auxiliary_loss_clip": 0.01191874, + "auxiliary_loss_mlp": 0.01183963, + "balance_loss_clip": 1.00248289, + "balance_loss_mlp": 1.00114369, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 2.039254952904037, + "language_loss": 0.8329283, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85668671, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.6346211433410645 + }, + { + "auxiliary_loss_clip": 0.01191963, + "auxiliary_loss_mlp": 0.0118388, + "balance_loss_clip": 1.00258398, + "balance_loss_mlp": 1.00106096, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.7213067198670522, + "language_loss": 0.71302807, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73678654, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.5085396766662598 + }, + { + "auxiliary_loss_clip": 0.01191822, + "auxiliary_loss_mlp": 0.01183949, + "balance_loss_clip": 1.00249588, + "balance_loss_mlp": 1.00112987, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 2.178546185638901, + "language_loss": 0.72546625, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74922395, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.4883627891540527 + }, + { + "auxiliary_loss_clip": 0.01191981, + "auxiliary_loss_mlp": 0.01184209, + "balance_loss_clip": 1.00263608, + "balance_loss_mlp": 1.00138974, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.0866468088135486, + "language_loss": 0.81412935, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83789122, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.4785501956939697 + }, + { + "auxiliary_loss_clip": 0.01191945, + "auxiliary_loss_mlp": 0.01184129, + "balance_loss_clip": 1.00260568, + "balance_loss_mlp": 1.00130939, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.608225804857961, + "language_loss": 0.90712798, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93088865, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.570892810821533 + }, + { + "auxiliary_loss_clip": 0.01191899, + "auxiliary_loss_mlp": 0.01183848, + "balance_loss_clip": 1.00255549, + "balance_loss_mlp": 1.00112355, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.1665351108998356, + "language_loss": 0.87867337, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90243089, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.516533851623535 + }, + { + "auxiliary_loss_clip": 0.01191958, + "auxiliary_loss_mlp": 0.0118402, + "balance_loss_clip": 1.00259852, + "balance_loss_mlp": 1.00129569, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 2.000942012482113, + "language_loss": 0.71657312, + "learning_rate": 3.998659901655851e-06, + "loss": 0.74033284, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.5667388439178467 + }, + { + "auxiliary_loss_clip": 0.01191852, + "auxiliary_loss_mlp": 0.01184139, + "balance_loss_clip": 1.00262761, + "balance_loss_mlp": 1.00141478, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 1.5500156055622882, + "language_loss": 0.86294699, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88670689, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.555680990219116 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01183974, + "balance_loss_clip": 1.00265622, + "balance_loss_mlp": 1.00153613, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.146852506972038, + "language_loss": 0.83052999, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85428905, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.5152523517608643 + }, + { + "auxiliary_loss_clip": 0.01191794, + "auxiliary_loss_mlp": 0.01183984, + "balance_loss_clip": 1.00253356, + "balance_loss_mlp": 1.00135517, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.9361313064979238, + "language_loss": 0.68347538, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70723319, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.5874695777893066 + }, + { + "auxiliary_loss_clip": 0.01191865, + "auxiliary_loss_mlp": 0.0118467, + "balance_loss_clip": 1.0025233, + "balance_loss_mlp": 1.00165939, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 2.0838568009945515, + "language_loss": 0.75233436, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77609968, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.5030617713928223 + }, + { + "auxiliary_loss_clip": 0.01191662, + "auxiliary_loss_mlp": 0.01183949, + "balance_loss_clip": 1.002442, + "balance_loss_mlp": 1.00151086, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.0648036451945044, + "language_loss": 0.84658289, + "learning_rate": 3.998587680434526e-06, + "loss": 0.87033892, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.470452070236206 + }, + { + "auxiliary_loss_clip": 0.01191885, + "auxiliary_loss_mlp": 0.01183814, + "balance_loss_clip": 1.00249386, + "balance_loss_mlp": 1.00128055, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.3551980114335254, + "language_loss": 0.88832682, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91208375, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.4992752075195312 + }, + { + "auxiliary_loss_clip": 0.01191879, + "auxiliary_loss_mlp": 0.01183861, + "balance_loss_clip": 1.00259006, + "balance_loss_mlp": 1.00113702, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.2612245978454704, + "language_loss": 0.82098484, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84474224, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.5548388957977295 + }, + { + "auxiliary_loss_clip": 0.01191962, + "auxiliary_loss_mlp": 0.01183814, + "balance_loss_clip": 1.00256228, + "balance_loss_mlp": 1.00109017, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 1.9611086825275206, + "language_loss": 0.83385056, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85760832, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.553614616394043 + }, + { + "auxiliary_loss_clip": 0.01191775, + "auxiliary_loss_mlp": 0.0118399, + "balance_loss_clip": 1.00253284, + "balance_loss_mlp": 1.00145698, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.7618976086840843, + "language_loss": 0.84564507, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86940265, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.4996957778930664 + }, + { + "auxiliary_loss_clip": 0.01191762, + "auxiliary_loss_mlp": 0.01183858, + "balance_loss_clip": 1.00254095, + "balance_loss_mlp": 1.00142026, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.9919596111576245, + "language_loss": 0.9255361, + "learning_rate": 3.998513564547216e-06, + "loss": 0.9492923, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.513399362564087 + }, + { + "auxiliary_loss_clip": 0.01191792, + "auxiliary_loss_mlp": 0.01183801, + "balance_loss_clip": 1.00259376, + "balance_loss_mlp": 1.00136292, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.3011975435198866, + "language_loss": 0.83521271, + "learning_rate": 3.998498514015987e-06, + "loss": 0.85896868, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 2.4781363010406494 + }, + { + "auxiliary_loss_clip": 0.01191835, + "auxiliary_loss_mlp": 0.01184123, + "balance_loss_clip": 1.00258529, + "balance_loss_mlp": 1.00168514, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.0923260771466214, + "language_loss": 0.9120627, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93582225, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 2.503164291381836 + }, + { + "auxiliary_loss_clip": 0.01192739, + "auxiliary_loss_mlp": 0.01181829, + "balance_loss_clip": 1.00403857, + "balance_loss_mlp": 1.00024915, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8948709995958165, + "language_loss": 0.67885613, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70260179, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 3.13651180267334 + }, + { + "auxiliary_loss_clip": 0.01191825, + "auxiliary_loss_mlp": 0.01183845, + "balance_loss_clip": 1.00259292, + "balance_loss_mlp": 1.00150251, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.900609079481828, + "language_loss": 0.88975859, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91351533, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.01191905, + "auxiliary_loss_mlp": 0.0118401, + "balance_loss_clip": 1.00266373, + "balance_loss_mlp": 1.00128615, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 1.9139463901003642, + "language_loss": 0.67270708, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69646621, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 2.537874937057495 + }, + { + "auxiliary_loss_clip": 0.01192654, + "auxiliary_loss_mlp": 0.01181968, + "balance_loss_clip": 1.0039959, + "balance_loss_mlp": 1.00038862, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.840379555159362, + "language_loss": 0.60769224, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.63143849, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.168806791305542 + }, + { + "auxiliary_loss_clip": 0.01192642, + "auxiliary_loss_mlp": 0.01181996, + "balance_loss_clip": 1.00394201, + "balance_loss_mlp": 1.00041664, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0187343385448107, + "language_loss": 0.57692665, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.60067302, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 5.72552752494812 + }, + { + "auxiliary_loss_clip": 0.01191841, + "auxiliary_loss_mlp": 0.01183732, + "balance_loss_clip": 1.00265634, + "balance_loss_mlp": 1.0012939, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.213928172709714, + "language_loss": 0.87438262, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89813834, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 4.130311965942383 + }, + { + "auxiliary_loss_clip": 0.01191489, + "auxiliary_loss_mlp": 0.0118363, + "balance_loss_clip": 1.00239468, + "balance_loss_mlp": 1.00138259, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7908258679680527, + "language_loss": 0.71300399, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73675513, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.501237392425537 + }, + { + "auxiliary_loss_clip": 0.01191695, + "auxiliary_loss_mlp": 0.01183431, + "balance_loss_clip": 1.00246859, + "balance_loss_mlp": 1.00099301, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 38.59231657506957, + "language_loss": 0.93425441, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95800567, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.567903995513916 + }, + { + "auxiliary_loss_clip": 0.01191468, + "auxiliary_loss_mlp": 0.0118339, + "balance_loss_clip": 1.00236869, + "balance_loss_mlp": 1.00095201, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.1910557257657426, + "language_loss": 0.81253207, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83628058, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.5872645378112793 + }, + { + "auxiliary_loss_clip": 0.01191639, + "auxiliary_loss_mlp": 0.01183503, + "balance_loss_clip": 1.00244415, + "balance_loss_mlp": 1.00116086, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.7091807989923598, + "language_loss": 0.82185906, + "learning_rate": 3.998327956604666e-06, + "loss": 0.8456105, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.4730827808380127 + }, + { + "auxiliary_loss_clip": 0.01191882, + "auxiliary_loss_mlp": 0.01183696, + "balance_loss_clip": 1.00269735, + "balance_loss_mlp": 1.00106788, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.716789661607055, + "language_loss": 0.85543978, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87919557, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.5346951484680176 + }, + { + "auxiliary_loss_clip": 0.01191674, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_clip": 1.00253582, + "balance_loss_mlp": 1.00129139, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 2.750481594785493, + "language_loss": 0.84683168, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87058377, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.4988608360290527 + }, + { + "auxiliary_loss_clip": 0.01191562, + "auxiliary_loss_mlp": 0.01183422, + "balance_loss_clip": 1.00240612, + "balance_loss_mlp": 1.00107968, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.7147104964666489, + "language_loss": 0.85648561, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.88023543, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.5234153270721436 + }, + { + "auxiliary_loss_clip": 0.01191716, + "auxiliary_loss_mlp": 0.01183332, + "balance_loss_clip": 1.0025121, + "balance_loss_mlp": 1.0010848, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.5418459256593966, + "language_loss": 0.90958464, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93333513, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.534672498703003 + }, + { + "auxiliary_loss_clip": 0.0119256, + "auxiliary_loss_mlp": 0.01182323, + "balance_loss_clip": 1.00374651, + "balance_loss_mlp": 1.00074375, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8806762262480273, + "language_loss": 0.63758409, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66133296, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.2246763706207275 + }, + { + "auxiliary_loss_clip": 0.01191562, + "auxiliary_loss_mlp": 0.01183335, + "balance_loss_clip": 1.0025264, + "balance_loss_mlp": 1.00118303, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.0567504055289505, + "language_loss": 0.74932194, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77307081, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.703742504119873 + }, + { + "auxiliary_loss_clip": 0.01191719, + "auxiliary_loss_mlp": 0.01183521, + "balance_loss_clip": 1.00256658, + "balance_loss_mlp": 1.00098801, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.06655602453556, + "language_loss": 0.72668862, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75044107, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.658369541168213 + }, + { + "auxiliary_loss_clip": 0.011925, + "auxiliary_loss_mlp": 0.01182289, + "balance_loss_clip": 1.00368667, + "balance_loss_mlp": 1.00070906, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9103592546941847, + "language_loss": 0.65505147, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67879933, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.1433613300323486 + }, + { + "auxiliary_loss_clip": 0.01192484, + "auxiliary_loss_mlp": 0.01182328, + "balance_loss_clip": 1.00365663, + "balance_loss_mlp": 1.00074887, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9848451184260363, + "language_loss": 0.58843559, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61218375, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 2.92429518699646 + }, + { + "auxiliary_loss_clip": 0.01191684, + "auxiliary_loss_mlp": 0.0118341, + "balance_loss_clip": 1.00257933, + "balance_loss_mlp": 1.00097251, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.9488703441670865, + "language_loss": 0.91596305, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93971401, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.5301263332366943 + }, + { + "auxiliary_loss_clip": 0.01191479, + "auxiliary_loss_mlp": 0.01183804, + "balance_loss_clip": 1.00232625, + "balance_loss_mlp": 1.00127125, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 2.657339253082114, + "language_loss": 0.66474521, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68849802, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.6380414962768555 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.011836, + "balance_loss_clip": 1.00249958, + "balance_loss_mlp": 1.00125742, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 2.1438963418484676, + "language_loss": 0.77753401, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80128616, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.5625226497650146 + }, + { + "auxiliary_loss_clip": 0.01191547, + "auxiliary_loss_mlp": 0.01183458, + "balance_loss_clip": 1.00246143, + "balance_loss_mlp": 1.00140142, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.0036618904951258, + "language_loss": 0.88219082, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90594089, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.4902257919311523 + }, + { + "auxiliary_loss_clip": 0.01191649, + "auxiliary_loss_mlp": 0.01183624, + "balance_loss_clip": 1.00250006, + "balance_loss_mlp": 1.00128102, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 1.961505321765452, + "language_loss": 0.84349817, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86725092, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.5445168018341064 + }, + { + "auxiliary_loss_clip": 0.01191585, + "auxiliary_loss_mlp": 0.01183592, + "balance_loss_clip": 1.00260425, + "balance_loss_mlp": 1.00124979, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 2.0310473823367476, + "language_loss": 0.83011663, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85386837, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.594860315322876 + }, + { + "auxiliary_loss_clip": 0.01191667, + "auxiliary_loss_mlp": 0.01183733, + "balance_loss_clip": 1.00256443, + "balance_loss_mlp": 1.00129533, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.9364272161865368, + "language_loss": 0.79605293, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81980687, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.531259298324585 + }, + { + "auxiliary_loss_clip": 0.01191628, + "auxiliary_loss_mlp": 0.01183286, + "balance_loss_clip": 1.00251639, + "balance_loss_mlp": 1.00113463, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.3820708285541032, + "language_loss": 0.87185884, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89560795, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.47061824798584 + }, + { + "auxiliary_loss_clip": 0.01192338, + "auxiliary_loss_mlp": 0.01182066, + "balance_loss_clip": 1.00358152, + "balance_loss_mlp": 1.00048661, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9130378642015496, + "language_loss": 0.55899632, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58274031, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.26767897605896 + }, + { + "auxiliary_loss_clip": 0.01191487, + "auxiliary_loss_mlp": 0.01183169, + "balance_loss_clip": 1.00246501, + "balance_loss_mlp": 1.00111294, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.114434960445859, + "language_loss": 0.82284546, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84659207, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.513984441757202 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01183105, + "balance_loss_clip": 1.0024817, + "balance_loss_mlp": 1.00095296, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2205076163065627, + "language_loss": 0.76986015, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79360616, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.5067968368530273 + }, + { + "auxiliary_loss_clip": 0.01191567, + "auxiliary_loss_mlp": 0.01183245, + "balance_loss_clip": 1.00246894, + "balance_loss_mlp": 1.00109339, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.9151571921233224, + "language_loss": 0.95135361, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97510171, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.5713717937469482 + }, + { + "auxiliary_loss_clip": 0.01191545, + "auxiliary_loss_mlp": 0.01183222, + "balance_loss_clip": 1.00238061, + "balance_loss_mlp": 1.00078464, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 4.050222090082188, + "language_loss": 0.88891423, + "learning_rate": 3.997959335640013e-06, + "loss": 0.91266191, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.507352828979492 + }, + { + "auxiliary_loss_clip": 0.01191574, + "auxiliary_loss_mlp": 0.01183246, + "balance_loss_clip": 1.00258207, + "balance_loss_mlp": 1.00109446, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 4.18304112353666, + "language_loss": 0.89083803, + "learning_rate": 3.997941708816791e-06, + "loss": 0.91458619, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.4852232933044434 + }, + { + "auxiliary_loss_clip": 0.01191544, + "auxiliary_loss_mlp": 0.01183288, + "balance_loss_clip": 1.0024513, + "balance_loss_mlp": 1.00123119, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.670834327369219, + "language_loss": 0.85892493, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88267326, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.520853042602539 + }, + { + "auxiliary_loss_clip": 0.01191508, + "auxiliary_loss_mlp": 0.01183251, + "balance_loss_clip": 1.00253177, + "balance_loss_mlp": 1.00119495, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.274103678371437, + "language_loss": 0.91311085, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93685853, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.4901621341705322 + }, + { + "auxiliary_loss_clip": 0.01191423, + "auxiliary_loss_mlp": 0.01182947, + "balance_loss_clip": 1.00255179, + "balance_loss_mlp": 1.00098586, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.9273967238092016, + "language_loss": 0.7808696, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80461335, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.5531442165374756 + }, + { + "auxiliary_loss_clip": 0.01191417, + "auxiliary_loss_mlp": 0.01183081, + "balance_loss_clip": 1.00238824, + "balance_loss_mlp": 1.000929, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 3.17779253306578, + "language_loss": 0.88408005, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90782499, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.6449403762817383 + }, + { + "auxiliary_loss_clip": 0.01191441, + "auxiliary_loss_mlp": 0.01183342, + "balance_loss_clip": 1.00248623, + "balance_loss_mlp": 1.00157154, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 2.2396887181346803, + "language_loss": 0.84635592, + "learning_rate": 3.997852438281901e-06, + "loss": 0.87010378, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 2.5380349159240723 + }, + { + "auxiliary_loss_clip": 0.01191591, + "auxiliary_loss_mlp": 0.0118301, + "balance_loss_clip": 1.00267029, + "balance_loss_mlp": 1.00104856, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.018016373807613, + "language_loss": 0.84783143, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87157738, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 2.643096446990967 + }, + { + "auxiliary_loss_clip": 0.01192329, + "auxiliary_loss_mlp": 0.01181869, + "balance_loss_clip": 1.0035857, + "balance_loss_mlp": 1.0002892, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8690616258421849, + "language_loss": 0.5912016, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61494362, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 3.1042354106903076 + }, + { + "auxiliary_loss_clip": 0.01191397, + "auxiliary_loss_mlp": 0.01182967, + "balance_loss_clip": 1.00251627, + "balance_loss_mlp": 1.00091076, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.321345541361132, + "language_loss": 0.91529757, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93904114, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 2.572227954864502 + }, + { + "auxiliary_loss_clip": 0.01191518, + "auxiliary_loss_mlp": 0.01183381, + "balance_loss_clip": 1.00257766, + "balance_loss_mlp": 1.00132453, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.142501865175342, + "language_loss": 0.71820796, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74195689, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.6359028816223145 + }, + { + "auxiliary_loss_clip": 0.01191229, + "auxiliary_loss_mlp": 0.0118304, + "balance_loss_clip": 1.00242996, + "balance_loss_mlp": 1.00126958, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.8438987914041103, + "language_loss": 0.8885808, + "learning_rate": 3.997761273778037e-06, + "loss": 0.91232347, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 5.437734603881836 + }, + { + "auxiliary_loss_clip": 0.01191328, + "auxiliary_loss_mlp": 0.01182783, + "balance_loss_clip": 1.00238442, + "balance_loss_mlp": 1.00091767, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0874381251606766, + "language_loss": 0.84162545, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86536658, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 2.522620916366577 + }, + { + "auxiliary_loss_clip": 0.01191561, + "auxiliary_loss_mlp": 0.01182909, + "balance_loss_clip": 1.00265789, + "balance_loss_mlp": 1.00104344, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 2.620678527670064, + "language_loss": 0.79707778, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82082248, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 3.9571242332458496 + }, + { + "auxiliary_loss_clip": 0.01191323, + "auxiliary_loss_mlp": 0.01182832, + "balance_loss_clip": 1.00252175, + "balance_loss_mlp": 1.00106168, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.2962820346400417, + "language_loss": 0.85438281, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87812436, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.5075714588165283 + }, + { + "auxiliary_loss_clip": 0.01191198, + "auxiliary_loss_mlp": 0.01182708, + "balance_loss_clip": 1.00231552, + "balance_loss_mlp": 1.00084281, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 1.7520954637783237, + "language_loss": 0.69309342, + "learning_rate": 3.997686978575302e-06, + "loss": 0.71683252, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.589272975921631 + }, + { + "auxiliary_loss_clip": 0.01191672, + "auxiliary_loss_mlp": 0.01183155, + "balance_loss_clip": 1.00278294, + "balance_loss_mlp": 1.00138438, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.240121921607948, + "language_loss": 0.68894309, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71269137, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.5682318210601807 + }, + { + "auxiliary_loss_clip": 0.01191409, + "auxiliary_loss_mlp": 0.01183097, + "balance_loss_clip": 1.00256038, + "balance_loss_mlp": 1.00123143, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9287894251645599, + "language_loss": 0.66628098, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69002604, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.4880571365356445 + }, + { + "auxiliary_loss_clip": 0.01191564, + "auxiliary_loss_mlp": 0.01183179, + "balance_loss_clip": 1.00262499, + "balance_loss_mlp": 1.00131357, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.590590861743196, + "language_loss": 0.76733184, + "learning_rate": 3.997630461769647e-06, + "loss": 0.79107922, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.541956901550293 + }, + { + "auxiliary_loss_clip": 0.01191414, + "auxiliary_loss_mlp": 0.01183358, + "balance_loss_clip": 1.00251079, + "balance_loss_mlp": 1.00139689, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.731378676660631, + "language_loss": 0.88860929, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91235703, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.469550609588623 + }, + { + "auxiliary_loss_clip": 0.01191351, + "auxiliary_loss_mlp": 0.01183044, + "balance_loss_clip": 1.00245631, + "balance_loss_mlp": 1.00117803, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.0230541840550718, + "language_loss": 0.74693936, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77068329, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.5269479751586914 + }, + { + "auxiliary_loss_clip": 0.0119141, + "auxiliary_loss_mlp": 0.01182794, + "balance_loss_clip": 1.00251544, + "balance_loss_mlp": 1.00102365, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.0786352050182306, + "language_loss": 0.69446468, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71820676, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.476860523223877 + }, + { + "auxiliary_loss_clip": 0.01191419, + "auxiliary_loss_mlp": 0.01182591, + "balance_loss_clip": 1.00262511, + "balance_loss_mlp": 1.00072491, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 3.884580218665926, + "language_loss": 0.9240737, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94781381, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.5154995918273926 + }, + { + "auxiliary_loss_clip": 0.01191515, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_clip": 1.00266981, + "balance_loss_mlp": 1.00136578, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2038929813092354, + "language_loss": 0.91233665, + "learning_rate": 3.997534752096277e-06, + "loss": 0.9360851, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.510007381439209 + }, + { + "auxiliary_loss_clip": 0.01191328, + "auxiliary_loss_mlp": 0.01182849, + "balance_loss_clip": 1.00252831, + "balance_loss_mlp": 1.00107908, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.6057770526796404, + "language_loss": 0.78583968, + "learning_rate": 3.997515382918531e-06, + "loss": 0.8095814, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.5073201656341553 + }, + { + "auxiliary_loss_clip": 0.01191545, + "auxiliary_loss_mlp": 0.01183028, + "balance_loss_clip": 1.00261521, + "balance_loss_mlp": 1.00125778, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.0162730607145654, + "language_loss": 0.78674251, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81048822, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.5087928771972656 + }, + { + "auxiliary_loss_clip": 0.01192726, + "auxiliary_loss_mlp": 0.0118123, + "balance_loss_clip": 1.00388467, + "balance_loss_mlp": 1.00041294, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8061830811249726, + "language_loss": 0.62700915, + "learning_rate": 3.997476417325827e-06, + "loss": 0.65074867, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.1870906352996826 + }, + { + "auxiliary_loss_clip": 0.01191513, + "auxiliary_loss_mlp": 0.01183329, + "balance_loss_clip": 1.00263822, + "balance_loss_mlp": 1.0012728, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.960555135535176, + "language_loss": 0.841959, + "learning_rate": 3.997456820912346e-06, + "loss": 0.8657074, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.548642158508301 + }, + { + "auxiliary_loss_clip": 0.01191195, + "auxiliary_loss_mlp": 0.01182628, + "balance_loss_clip": 1.00245547, + "balance_loss_mlp": 1.00104856, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 4.5971022534762005, + "language_loss": 0.8825677, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90630591, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.5231475830078125 + }, + { + "auxiliary_loss_clip": 0.01191465, + "auxiliary_loss_mlp": 0.01182889, + "balance_loss_clip": 1.00254047, + "balance_loss_mlp": 1.00102353, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.2679863721392723, + "language_loss": 0.73343289, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75717646, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.535742998123169 + }, + { + "auxiliary_loss_clip": 0.01191399, + "auxiliary_loss_mlp": 0.01183031, + "balance_loss_clip": 1.00256431, + "balance_loss_mlp": 1.00145149, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.3406511612541445, + "language_loss": 0.82284057, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84658486, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.510134220123291 + }, + { + "auxiliary_loss_clip": 0.01191415, + "auxiliary_loss_mlp": 0.01182781, + "balance_loss_clip": 1.00262976, + "balance_loss_mlp": 1.00110579, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7936555247813097, + "language_loss": 0.79735816, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82110012, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.5488479137420654 + }, + { + "auxiliary_loss_clip": 0.01192826, + "auxiliary_loss_mlp": 0.01181232, + "balance_loss_clip": 1.0039283, + "balance_loss_mlp": 1.00041568, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.0057283498073082, + "language_loss": 0.58681273, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.61055332, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.1371893882751465 + }, + { + "auxiliary_loss_clip": 0.01191299, + "auxiliary_loss_mlp": 0.01182907, + "balance_loss_clip": 1.00252104, + "balance_loss_mlp": 1.00123191, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8662117822943016, + "language_loss": 0.87539291, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.89913499, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.517627716064453 + }, + { + "auxiliary_loss_clip": 0.01191293, + "auxiliary_loss_mlp": 0.01183027, + "balance_loss_clip": 1.00261402, + "balance_loss_mlp": 1.00135195, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 2.866875567069742, + "language_loss": 0.86232972, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88607287, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.583584785461426 + }, + { + "auxiliary_loss_clip": 0.01191471, + "auxiliary_loss_mlp": 0.01182701, + "balance_loss_clip": 1.00268006, + "balance_loss_mlp": 1.00102556, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 2.555802651808535, + "language_loss": 0.87746429, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90120596, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.5126051902770996 + }, + { + "auxiliary_loss_clip": 0.01191349, + "auxiliary_loss_mlp": 0.01182885, + "balance_loss_clip": 1.00260305, + "balance_loss_mlp": 1.00111485, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.512828807741115, + "language_loss": 0.84321153, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86695385, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.5057315826416016 + }, + { + "auxiliary_loss_clip": 0.01191411, + "auxiliary_loss_mlp": 0.01182451, + "balance_loss_clip": 1.0027051, + "balance_loss_mlp": 1.00087106, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 3.251875075793795, + "language_loss": 0.86895293, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89269161, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.5724098682403564 + }, + { + "auxiliary_loss_clip": 0.01191234, + "auxiliary_loss_mlp": 0.0118268, + "balance_loss_clip": 1.00247502, + "balance_loss_mlp": 1.00110066, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.4399826790259236, + "language_loss": 0.75168562, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77542472, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.5577659606933594 + }, + { + "auxiliary_loss_clip": 0.01191298, + "auxiliary_loss_mlp": 0.01182528, + "balance_loss_clip": 1.00264192, + "balance_loss_mlp": 1.00104403, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 2.156649133127914, + "language_loss": 0.86228395, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88602227, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.497699737548828 + }, + { + "auxiliary_loss_clip": 0.01191274, + "auxiliary_loss_mlp": 0.0118246, + "balance_loss_clip": 1.00252259, + "balance_loss_mlp": 1.00097537, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 3.071645034613792, + "language_loss": 0.87153494, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89527225, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.5090160369873047 + }, + { + "auxiliary_loss_clip": 0.01191249, + "auxiliary_loss_mlp": 0.01182843, + "balance_loss_clip": 1.00254011, + "balance_loss_mlp": 1.00135899, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.5030592451316913, + "language_loss": 0.8378548, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86159575, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.523542881011963 + }, + { + "auxiliary_loss_clip": 0.01191291, + "auxiliary_loss_mlp": 0.01182633, + "balance_loss_clip": 1.00262237, + "balance_loss_mlp": 1.00105309, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.004663328858886, + "language_loss": 0.73769546, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76143467, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.5789802074432373 + }, + { + "auxiliary_loss_clip": 0.01191261, + "auxiliary_loss_mlp": 0.01182438, + "balance_loss_clip": 1.00258565, + "balance_loss_mlp": 1.00114405, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.3946537462653303, + "language_loss": 0.78502309, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80876011, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.543104887008667 + }, + { + "auxiliary_loss_clip": 0.01191378, + "auxiliary_loss_mlp": 0.01182474, + "balance_loss_clip": 1.00265074, + "balance_loss_mlp": 1.00098932, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.180368952160145, + "language_loss": 0.73495841, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75869691, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.612600564956665 + }, + { + "auxiliary_loss_clip": 0.01191209, + "auxiliary_loss_mlp": 0.01182343, + "balance_loss_clip": 1.0024879, + "balance_loss_mlp": 1.00076365, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 2.045826322103043, + "language_loss": 0.77301526, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79675078, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 2.5300230979919434 + }, + { + "auxiliary_loss_clip": 0.01191226, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_clip": 1.00252366, + "balance_loss_mlp": 1.00105071, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 3.653813311759459, + "language_loss": 0.71621364, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73995125, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 2.573453426361084 + }, + { + "auxiliary_loss_clip": 0.01191187, + "auxiliary_loss_mlp": 0.01182768, + "balance_loss_clip": 1.00258148, + "balance_loss_mlp": 1.00128353, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 2.445224330967536, + "language_loss": 0.76862442, + "learning_rate": 3.997048987461856e-06, + "loss": 0.792364, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 2.6191720962524414 + }, + { + "auxiliary_loss_clip": 0.01191225, + "auxiliary_loss_mlp": 0.01182226, + "balance_loss_clip": 1.00257397, + "balance_loss_mlp": 1.00102818, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.064442930404506, + "language_loss": 0.79098189, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81471634, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.4978907108306885 + }, + { + "auxiliary_loss_clip": 0.01191012, + "auxiliary_loss_mlp": 0.01182319, + "balance_loss_clip": 1.00243568, + "balance_loss_mlp": 1.00121641, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.8795732302163435, + "language_loss": 0.77230585, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7960391, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 4.033670663833618 + }, + { + "auxiliary_loss_clip": 0.01191251, + "auxiliary_loss_mlp": 0.01182389, + "balance_loss_clip": 1.00265872, + "balance_loss_mlp": 1.00109601, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.3666975892427993, + "language_loss": 0.76495242, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78868884, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 4.013776540756226 + }, + { + "auxiliary_loss_clip": 0.0119136, + "auxiliary_loss_mlp": 0.01182705, + "balance_loss_clip": 1.00267148, + "balance_loss_mlp": 1.00122094, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.4465202071981653, + "language_loss": 0.74167907, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76541972, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 3.986279249191284 + }, + { + "auxiliary_loss_clip": 0.01191141, + "auxiliary_loss_mlp": 0.01182345, + "balance_loss_clip": 1.00255013, + "balance_loss_mlp": 1.0010519, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.1063038446263267, + "language_loss": 0.80240786, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82614273, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 4.002565622329712 + }, + { + "auxiliary_loss_clip": 0.01191048, + "auxiliary_loss_mlp": 0.01182106, + "balance_loss_clip": 1.00254536, + "balance_loss_mlp": 1.0009079, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.288356703069817, + "language_loss": 0.81306672, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83679819, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.517749547958374 + }, + { + "auxiliary_loss_clip": 0.01190997, + "auxiliary_loss_mlp": 0.01182563, + "balance_loss_clip": 1.00240266, + "balance_loss_mlp": 1.00126958, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 3.015691895835471, + "language_loss": 0.8100695, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83380508, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.492347478866577 + }, + { + "auxiliary_loss_clip": 0.011913, + "auxiliary_loss_mlp": 0.01182472, + "balance_loss_clip": 1.00269365, + "balance_loss_mlp": 1.00117826, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.0176474716004127, + "language_loss": 0.89309204, + "learning_rate": 3.996877372161152e-06, + "loss": 0.91682982, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.469708204269409 + }, + { + "auxiliary_loss_clip": 0.01191099, + "auxiliary_loss_mlp": 0.01182248, + "balance_loss_clip": 1.00236559, + "balance_loss_mlp": 1.00095415, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 2.376358816631175, + "language_loss": 0.76735353, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.79108709, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.467643976211548 + }, + { + "auxiliary_loss_clip": 0.01191252, + "auxiliary_loss_mlp": 0.01182298, + "balance_loss_clip": 1.00276566, + "balance_loss_mlp": 1.0011003, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 3.665838412748605, + "language_loss": 0.81091416, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83464968, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.512218475341797 + }, + { + "auxiliary_loss_clip": 0.01191116, + "auxiliary_loss_mlp": 0.01181988, + "balance_loss_clip": 1.00257611, + "balance_loss_mlp": 1.00098109, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 1.8837582844409437, + "language_loss": 0.84718788, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87091893, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.5122673511505127 + }, + { + "auxiliary_loss_clip": 0.01191188, + "auxiliary_loss_mlp": 0.0118213, + "balance_loss_clip": 1.00269604, + "balance_loss_mlp": 1.0012176, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 3.8698425907844585, + "language_loss": 0.82038283, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84411597, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.49934983253479 + }, + { + "auxiliary_loss_clip": 0.01190936, + "auxiliary_loss_mlp": 0.01182158, + "balance_loss_clip": 1.00239623, + "balance_loss_mlp": 1.00095987, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 3.149355963510558, + "language_loss": 0.88137937, + "learning_rate": 3.996767651613597e-06, + "loss": 0.9051103, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.6643147468566895 + }, + { + "auxiliary_loss_clip": 0.01190994, + "auxiliary_loss_mlp": 0.0118251, + "balance_loss_clip": 1.00252295, + "balance_loss_mlp": 1.00131154, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.202832829107014, + "language_loss": 0.90571046, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92944551, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.48573637008667 + }, + { + "auxiliary_loss_clip": 0.01191059, + "auxiliary_loss_mlp": 0.01182134, + "balance_loss_clip": 1.00257063, + "balance_loss_mlp": 1.00131786, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 4.513917402430457, + "language_loss": 0.73529106, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75902301, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.4907708168029785 + }, + { + "auxiliary_loss_clip": 0.01191245, + "auxiliary_loss_mlp": 0.0118218, + "balance_loss_clip": 1.00260699, + "balance_loss_mlp": 1.00098157, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 1.825606413996748, + "language_loss": 0.86417031, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88790458, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.5392606258392334 + }, + { + "auxiliary_loss_clip": 0.01190979, + "auxiliary_loss_mlp": 0.01182713, + "balance_loss_clip": 1.0024488, + "balance_loss_mlp": 1.00132406, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.0359290645408814, + "language_loss": 0.70038772, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72412467, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 2.550081491470337 + }, + { + "auxiliary_loss_clip": 0.0119099, + "auxiliary_loss_mlp": 0.01182175, + "balance_loss_clip": 1.00252426, + "balance_loss_mlp": 1.00126338, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 1.82311803341814, + "language_loss": 0.8072387, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83097041, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.5443451404571533 + }, + { + "auxiliary_loss_clip": 0.01191059, + "auxiliary_loss_mlp": 0.01182074, + "balance_loss_clip": 1.00259686, + "balance_loss_mlp": 1.00116229, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.7168319844681883, + "language_loss": 0.81510687, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83883822, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 2.480043649673462 + }, + { + "auxiliary_loss_clip": 0.01192626, + "auxiliary_loss_mlp": 0.01180564, + "balance_loss_clip": 1.00425076, + "balance_loss_mlp": 1.00051045, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 1.0348938106667356, + "language_loss": 0.64454246, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66827434, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.011265516281128 + }, + { + "auxiliary_loss_clip": 0.0119095, + "auxiliary_loss_mlp": 0.01182033, + "balance_loss_clip": 1.00247216, + "balance_loss_mlp": 1.00112104, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 2.1119193101492404, + "language_loss": 0.90961945, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93334931, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.5090324878692627 + }, + { + "auxiliary_loss_clip": 0.0119127, + "auxiliary_loss_mlp": 0.01182093, + "balance_loss_clip": 1.0027616, + "balance_loss_mlp": 1.00118136, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 3.950658495520385, + "language_loss": 0.86791295, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89164662, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.5356342792510986 + }, + { + "auxiliary_loss_clip": 0.01191198, + "auxiliary_loss_mlp": 0.01182198, + "balance_loss_clip": 1.0026598, + "balance_loss_mlp": 1.00119066, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 4.625031764642235, + "language_loss": 0.84157467, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86530864, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.529557466506958 + }, + { + "auxiliary_loss_clip": 0.01191106, + "auxiliary_loss_mlp": 0.01182545, + "balance_loss_clip": 1.00264072, + "balance_loss_mlp": 1.0013473, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 2.325014320177034, + "language_loss": 0.79924995, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82298648, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.7103476524353027 + }, + { + "auxiliary_loss_clip": 0.01191115, + "auxiliary_loss_mlp": 0.01181896, + "balance_loss_clip": 1.00267828, + "balance_loss_mlp": 1.00107932, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.19726517222841, + "language_loss": 0.86449409, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88822424, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.484571695327759 + }, + { + "auxiliary_loss_clip": 0.0119115, + "auxiliary_loss_mlp": 0.01182057, + "balance_loss_clip": 1.00268483, + "balance_loss_mlp": 1.00095415, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.6408208214354114, + "language_loss": 0.85481954, + "learning_rate": 3.996473519492753e-06, + "loss": 0.8785516, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.5262176990509033 + }, + { + "auxiliary_loss_clip": 0.01191036, + "auxiliary_loss_mlp": 0.0118188, + "balance_loss_clip": 1.00266027, + "balance_loss_mlp": 1.00115836, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.376417325904921, + "language_loss": 0.86670673, + "learning_rate": 3.99645036397538e-06, + "loss": 0.89043587, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.5368878841400146 + }, + { + "auxiliary_loss_clip": 0.01190933, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_clip": 1.0024991, + "balance_loss_mlp": 1.00117159, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 2.0324876391692226, + "language_loss": 0.68196821, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70569927, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.555755376815796 + }, + { + "auxiliary_loss_clip": 0.0119083, + "auxiliary_loss_mlp": 0.01181798, + "balance_loss_clip": 1.00247598, + "balance_loss_mlp": 1.00098169, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.056224490337765, + "language_loss": 0.76747751, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79120374, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.58254337310791 + }, + { + "auxiliary_loss_clip": 0.01190828, + "auxiliary_loss_mlp": 0.01182446, + "balance_loss_clip": 1.00249505, + "balance_loss_mlp": 1.00153446, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 3.117501003300005, + "language_loss": 0.86585331, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88958597, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.484604835510254 + }, + { + "auxiliary_loss_clip": 0.01190987, + "auxiliary_loss_mlp": 0.01181966, + "balance_loss_clip": 1.0025202, + "balance_loss_mlp": 1.00114942, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 2.250009165799722, + "language_loss": 0.89725029, + "learning_rate": 3.996356984858732e-06, + "loss": 0.9209798, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.50563383102417 + }, + { + "auxiliary_loss_clip": 0.01191034, + "auxiliary_loss_mlp": 0.01182149, + "balance_loss_clip": 1.00266743, + "balance_loss_mlp": 1.00114202, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.09599097259239, + "language_loss": 0.84912407, + "learning_rate": 3.996333450822208e-06, + "loss": 0.8728559, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.5282135009765625 + }, + { + "auxiliary_loss_clip": 0.01191117, + "auxiliary_loss_mlp": 0.01182112, + "balance_loss_clip": 1.00255287, + "balance_loss_mlp": 1.00129509, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 2.1022307000345, + "language_loss": 0.8073591, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8310914, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.5124831199645996 + }, + { + "auxiliary_loss_clip": 0.01190983, + "auxiliary_loss_mlp": 0.01181958, + "balance_loss_clip": 1.00254726, + "balance_loss_mlp": 1.00133181, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 3.1673364385179665, + "language_loss": 0.74617028, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76989973, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.5061795711517334 + }, + { + "auxiliary_loss_clip": 0.0119097, + "auxiliary_loss_mlp": 0.01182034, + "balance_loss_clip": 1.00265503, + "balance_loss_mlp": 1.00131297, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.993064256826785, + "language_loss": 0.90156329, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92529327, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.5158939361572266 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01181789, + "balance_loss_clip": 1.00252843, + "balance_loss_mlp": 1.00106788, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 8.01629552886345, + "language_loss": 0.74733192, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77105904, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.536252737045288 + }, + { + "auxiliary_loss_clip": 0.01190871, + "auxiliary_loss_mlp": 0.01181918, + "balance_loss_clip": 1.0024848, + "balance_loss_mlp": 1.00119662, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.6381757255295453, + "language_loss": 0.83423197, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85795987, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.5367305278778076 + }, + { + "auxiliary_loss_clip": 0.01190978, + "auxiliary_loss_mlp": 0.01182278, + "balance_loss_clip": 1.00256038, + "balance_loss_mlp": 1.00108027, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.4580570663966834, + "language_loss": 0.91270065, + "learning_rate": 3.996190656910043e-06, + "loss": 0.9364332, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.0119108, + "auxiliary_loss_mlp": 0.01181831, + "balance_loss_clip": 1.00263393, + "balance_loss_mlp": 1.00091863, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.7964935838039002, + "language_loss": 0.80157787, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82530695, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 2.5202090740203857 + }, + { + "auxiliary_loss_clip": 0.01191107, + "auxiliary_loss_mlp": 0.01182679, + "balance_loss_clip": 1.00266814, + "balance_loss_mlp": 1.00157678, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 3.6995673423625464, + "language_loss": 0.85018444, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87392229, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 3.954771041870117 + }, + { + "auxiliary_loss_clip": 0.01191161, + "auxiliary_loss_mlp": 0.01182087, + "balance_loss_clip": 1.00268817, + "balance_loss_mlp": 1.00127041, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.0936042915342132, + "language_loss": 0.75731587, + "learning_rate": 3.996118238049124e-06, + "loss": 0.78104836, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 4.0758843421936035 + }, + { + "auxiliary_loss_clip": 0.01191203, + "auxiliary_loss_mlp": 0.01182293, + "balance_loss_clip": 1.00281394, + "balance_loss_mlp": 1.00147653, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.6132039389341415, + "language_loss": 0.84771204, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87144697, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.4850947856903076 + }, + { + "auxiliary_loss_clip": 0.01191046, + "auxiliary_loss_mlp": 0.01181938, + "balance_loss_clip": 1.00258195, + "balance_loss_mlp": 1.00121665, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 2.1546560168966122, + "language_loss": 0.90817696, + "learning_rate": 3.996069580341966e-06, + "loss": 0.93190682, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 5.3999927043914795 + }, + { + "auxiliary_loss_clip": 0.01190917, + "auxiliary_loss_mlp": 0.01182179, + "balance_loss_clip": 1.002545, + "balance_loss_mlp": 1.00155342, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 1.991865534915194, + "language_loss": 0.89408636, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91781735, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.547790050506592 + }, + { + "auxiliary_loss_clip": 0.01190966, + "auxiliary_loss_mlp": 0.01182022, + "balance_loss_clip": 1.00258875, + "balance_loss_mlp": 1.00120544, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.2631300127132072, + "language_loss": 0.67373562, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69746548, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.5621554851531982 + }, + { + "auxiliary_loss_clip": 0.01192762, + "auxiliary_loss_mlp": 0.0118123, + "balance_loss_clip": 1.00442863, + "balance_loss_mlp": 1.00117636, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3346735140129076, + "language_loss": 0.62254709, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64628702, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.1971945762634277 + }, + { + "auxiliary_loss_clip": 0.01191052, + "auxiliary_loss_mlp": 0.0118215, + "balance_loss_clip": 1.00261664, + "balance_loss_mlp": 1.001333, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.590332710044581, + "language_loss": 0.90571368, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92944568, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.630307197570801 + }, + { + "auxiliary_loss_clip": 0.01191078, + "auxiliary_loss_mlp": 0.01181906, + "balance_loss_clip": 1.0026691, + "balance_loss_mlp": 1.00099432, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 3.4551436244154767, + "language_loss": 0.66738343, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69111329, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.4964241981506348 + }, + { + "auxiliary_loss_clip": 0.01191093, + "auxiliary_loss_mlp": 0.01182202, + "balance_loss_clip": 1.00265789, + "balance_loss_mlp": 1.00119507, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 1.7413677329109574, + "language_loss": 0.78296125, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80669427, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.5213217735290527 + }, + { + "auxiliary_loss_clip": 0.01190988, + "auxiliary_loss_mlp": 0.01182171, + "balance_loss_clip": 1.00251734, + "balance_loss_mlp": 1.00135446, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 3.2697539267426063, + "language_loss": 0.78523278, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80896437, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.6614902019500732 + }, + { + "auxiliary_loss_clip": 0.01190966, + "auxiliary_loss_mlp": 0.01181497, + "balance_loss_clip": 1.00256717, + "balance_loss_mlp": 1.00096655, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 1.8094924690677736, + "language_loss": 0.83782732, + "learning_rate": 3.995871921941519e-06, + "loss": 0.86155188, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.5643179416656494 + }, + { + "auxiliary_loss_clip": 0.01191085, + "auxiliary_loss_mlp": 0.01181954, + "balance_loss_clip": 1.00265098, + "balance_loss_mlp": 1.00132799, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.831444775794932, + "language_loss": 0.75336325, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77709365, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.4957683086395264 + }, + { + "auxiliary_loss_clip": 0.01190938, + "auxiliary_loss_mlp": 0.01181984, + "balance_loss_clip": 1.00253463, + "balance_loss_mlp": 1.00126266, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.8039842221842433, + "language_loss": 0.7926237, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81635296, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.4847378730773926 + }, + { + "auxiliary_loss_clip": 0.01125496, + "auxiliary_loss_mlp": 0.01181657, + "balance_loss_clip": 1.00241137, + "balance_loss_mlp": 1.00112677, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.635458501141213, + "language_loss": 0.91714048, + "learning_rate": 3.995796551235016e-06, + "loss": 0.94021201, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.6835482120513916 + }, + { + "auxiliary_loss_clip": 0.01157926, + "auxiliary_loss_mlp": 0.01181996, + "balance_loss_clip": 1.00249648, + "balance_loss_mlp": 1.00127459, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9932634260205033, + "language_loss": 0.83374548, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85714471, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.844416618347168 + }, + { + "auxiliary_loss_clip": 0.01157587, + "auxiliary_loss_mlp": 0.01181393, + "balance_loss_clip": 1.00229049, + "balance_loss_mlp": 1.00095749, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.2923295642960633, + "language_loss": 0.81721896, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84060872, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.723414659500122 + }, + { + "auxiliary_loss_clip": 0.01174184, + "auxiliary_loss_mlp": 0.01181957, + "balance_loss_clip": 1.00240588, + "balance_loss_mlp": 1.00104463, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.857298834616751, + "language_loss": 0.91931736, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94287878, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.5756049156188965 + }, + { + "auxiliary_loss_clip": 0.01190728, + "auxiliary_loss_mlp": 0.01182387, + "balance_loss_clip": 1.00241137, + "balance_loss_mlp": 1.00166535, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.109234571262952, + "language_loss": 0.76498455, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78871566, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.50311017036438 + }, + { + "auxiliary_loss_clip": 0.01174323, + "auxiliary_loss_mlp": 0.01181816, + "balance_loss_clip": 1.00246596, + "balance_loss_mlp": 1.00128555, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.187807878841658, + "language_loss": 0.83810902, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.86167037, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.57853627204895 + }, + { + "auxiliary_loss_clip": 0.01157825, + "auxiliary_loss_mlp": 0.00749762, + "balance_loss_clip": 1.00244677, + "balance_loss_mlp": 1.00070977, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.0383183010596992, + "language_loss": 0.72868598, + "learning_rate": 3.995643766466275e-06, + "loss": 0.74776185, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.570479154586792 + }, + { + "auxiliary_loss_clip": 0.01158217, + "auxiliary_loss_mlp": 0.01182038, + "balance_loss_clip": 1.00238299, + "balance_loss_mlp": 1.00131631, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.7083599947538113, + "language_loss": 0.8323192, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85572177, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.632970094680786 + }, + { + "auxiliary_loss_clip": 0.01190709, + "auxiliary_loss_mlp": 0.01181808, + "balance_loss_clip": 1.00251508, + "balance_loss_mlp": 1.00146842, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.3239597576230877, + "language_loss": 0.85626841, + "learning_rate": 3.995592232799595e-06, + "loss": 0.87999362, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.535155773162842 + }, + { + "auxiliary_loss_clip": 0.01158027, + "auxiliary_loss_mlp": 0.0118197, + "balance_loss_clip": 1.00238383, + "balance_loss_mlp": 1.00134373, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.072608838819178, + "language_loss": 0.94360918, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96700907, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.63588547706604 + }, + { + "auxiliary_loss_clip": 0.01190857, + "auxiliary_loss_mlp": 0.01181961, + "balance_loss_clip": 1.00264239, + "balance_loss_mlp": 1.00162148, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.5181141629040416, + "language_loss": 0.77479064, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79851884, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.5874476432800293 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01182428, + "balance_loss_clip": 1.00265217, + "balance_loss_mlp": 1.00142097, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 3.9109507982594747, + "language_loss": 0.78444886, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80801839, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 2.5442028045654297 + }, + { + "auxiliary_loss_clip": 0.01174268, + "auxiliary_loss_mlp": 0.01181801, + "balance_loss_clip": 1.00248706, + "balance_loss_mlp": 1.00117528, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.2363981837775238, + "language_loss": 0.82663465, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85019535, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.6798453330993652 + }, + { + "auxiliary_loss_clip": 0.01174397, + "auxiliary_loss_mlp": 0.01181647, + "balance_loss_clip": 1.00235844, + "balance_loss_mlp": 1.00111616, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.0376842338167545, + "language_loss": 0.76599109, + "learning_rate": 3.995462074371614e-06, + "loss": 0.7895515, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.635495662689209 + }, + { + "auxiliary_loss_clip": 0.01174039, + "auxiliary_loss_mlp": 0.01181592, + "balance_loss_clip": 1.00232577, + "balance_loss_mlp": 1.00125217, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 1.8932796367488827, + "language_loss": 0.87893355, + "learning_rate": 3.99543581567769e-06, + "loss": 0.9024899, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.6235432624816895 + }, + { + "auxiliary_loss_clip": 0.01158361, + "auxiliary_loss_mlp": 0.01181825, + "balance_loss_clip": 1.00245166, + "balance_loss_mlp": 1.0010078, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.8844753573757467, + "language_loss": 0.87704051, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.90044236, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.61336612701416 + }, + { + "auxiliary_loss_clip": 0.0112468, + "auxiliary_loss_mlp": 0.0118175, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00131476, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.4461967911634326, + "language_loss": 0.81961071, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84267509, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.676593542098999 + }, + { + "auxiliary_loss_clip": 0.0119086, + "auxiliary_loss_mlp": 0.01182072, + "balance_loss_clip": 1.00262427, + "balance_loss_mlp": 1.0015415, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.7588334902363671, + "language_loss": 0.87053096, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89426029, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.622781276702881 + }, + { + "auxiliary_loss_clip": 0.01190726, + "auxiliary_loss_mlp": 0.01181221, + "balance_loss_clip": 1.00251317, + "balance_loss_mlp": 1.00088155, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 2.2679461517208748, + "language_loss": 0.83508313, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85880268, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.5655715465545654 + }, + { + "auxiliary_loss_clip": 0.01173966, + "auxiliary_loss_mlp": 0.01181687, + "balance_loss_clip": 1.00233173, + "balance_loss_mlp": 1.00096595, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.332224185185384, + "language_loss": 0.65436387, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67792034, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.705850839614868 + }, + { + "auxiliary_loss_clip": 0.01174182, + "auxiliary_loss_mlp": 0.01181897, + "balance_loss_clip": 1.00245035, + "balance_loss_mlp": 1.00136626, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 3.058063618926617, + "language_loss": 0.83354408, + "learning_rate": 3.995276674539547e-06, + "loss": 0.85710478, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.5653038024902344 + }, + { + "auxiliary_loss_clip": 0.01157471, + "auxiliary_loss_mlp": 0.01181868, + "balance_loss_clip": 1.00219381, + "balance_loss_mlp": 1.00124168, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 2.045524698438203, + "language_loss": 0.80462492, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82801831, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.581315279006958 + }, + { + "auxiliary_loss_clip": 0.01190781, + "auxiliary_loss_mlp": 0.01181899, + "balance_loss_clip": 1.00255895, + "balance_loss_mlp": 1.00155914, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 2.195821356744991, + "language_loss": 0.75772631, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78145313, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.5762224197387695 + }, + { + "auxiliary_loss_clip": 0.01157698, + "auxiliary_loss_mlp": 0.01181746, + "balance_loss_clip": 1.00232327, + "balance_loss_mlp": 1.00131106, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.08607898746834, + "language_loss": 0.81597602, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83937049, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.6962738037109375 + }, + { + "auxiliary_loss_clip": 0.01143147, + "auxiliary_loss_mlp": 0.00750097, + "balance_loss_clip": 1.00379205, + "balance_loss_mlp": 1.00126326, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9915847093209789, + "language_loss": 0.65675855, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67569101, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 3.2140605449676514 + }, + { + "auxiliary_loss_clip": 0.01157674, + "auxiliary_loss_mlp": 0.01181332, + "balance_loss_clip": 1.00225639, + "balance_loss_mlp": 1.00108731, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.9134093763499176, + "language_loss": 0.77008921, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79347926, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 4.008908748626709 + }, + { + "auxiliary_loss_clip": 0.01141285, + "auxiliary_loss_mlp": 0.01181831, + "balance_loss_clip": 1.00222039, + "balance_loss_mlp": 1.00120497, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.4578763419768994, + "language_loss": 0.88875782, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91198897, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 4.072934150695801 + }, + { + "auxiliary_loss_clip": 0.01157712, + "auxiliary_loss_mlp": 0.01181456, + "balance_loss_clip": 1.00237894, + "balance_loss_mlp": 1.00102127, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.961282116969088, + "language_loss": 0.75484186, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77823359, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 4.121927738189697 + }, + { + "auxiliary_loss_clip": 0.01158156, + "auxiliary_loss_mlp": 0.01182007, + "balance_loss_clip": 1.00233042, + "balance_loss_mlp": 1.00138068, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 3.864438851656399, + "language_loss": 0.9096244, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93302608, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.545660972595215 + }, + { + "auxiliary_loss_clip": 0.0119073, + "auxiliary_loss_mlp": 0.01181719, + "balance_loss_clip": 1.00255942, + "balance_loss_mlp": 1.00128341, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 2.291747470990627, + "language_loss": 0.82541341, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8491379, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.563305139541626 + }, + { + "auxiliary_loss_clip": 0.01157421, + "auxiliary_loss_mlp": 0.01181557, + "balance_loss_clip": 1.00220275, + "balance_loss_mlp": 1.00112176, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 1.8213887397380888, + "language_loss": 0.7859239, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80931371, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.6967101097106934 + }, + { + "auxiliary_loss_clip": 0.011582, + "auxiliary_loss_mlp": 0.01181534, + "balance_loss_clip": 1.00252986, + "balance_loss_mlp": 1.00128961, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9008614111619067, + "language_loss": 0.88796145, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91135883, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.608851909637451 + }, + { + "auxiliary_loss_clip": 0.01157624, + "auxiliary_loss_mlp": 0.01181614, + "balance_loss_clip": 1.00243199, + "balance_loss_mlp": 1.00127387, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.2938890220001915, + "language_loss": 0.76036572, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78375816, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.6687846183776855 + }, + { + "auxiliary_loss_clip": 0.01158063, + "auxiliary_loss_mlp": 0.0118192, + "balance_loss_clip": 1.00246119, + "balance_loss_mlp": 1.00119925, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 2.2710618744111972, + "language_loss": 0.78981203, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81321192, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.5951056480407715 + }, + { + "auxiliary_loss_clip": 0.01173874, + "auxiliary_loss_mlp": 0.01181665, + "balance_loss_clip": 1.00231838, + "balance_loss_mlp": 1.00132489, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.120896776988591, + "language_loss": 0.86047602, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88403141, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.5477776527404785 + }, + { + "auxiliary_loss_clip": 0.01141007, + "auxiliary_loss_mlp": 0.01181327, + "balance_loss_clip": 1.00235677, + "balance_loss_mlp": 1.00136876, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 3.0832621448745847, + "language_loss": 0.87401462, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.8972379, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.6024017333984375 + }, + { + "auxiliary_loss_clip": 0.0114119, + "auxiliary_loss_mlp": 0.01181652, + "balance_loss_clip": 1.00226808, + "balance_loss_mlp": 1.00150323, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.8370657305766918, + "language_loss": 0.63481873, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65804714, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.7452738285064697 + }, + { + "auxiliary_loss_clip": 0.0119058, + "auxiliary_loss_mlp": 0.01181425, + "balance_loss_clip": 1.00247455, + "balance_loss_mlp": 1.00118029, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 1.907604308915696, + "language_loss": 0.83393967, + "learning_rate": 3.994810983642281e-06, + "loss": 0.8576597, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.5243606567382812 + }, + { + "auxiliary_loss_clip": 0.01174007, + "auxiliary_loss_mlp": 0.01181461, + "balance_loss_clip": 1.00243104, + "balance_loss_mlp": 1.00102592, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 12.985518389223786, + "language_loss": 0.8763473, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89990199, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.5470101833343506 + }, + { + "auxiliary_loss_clip": 0.01190598, + "auxiliary_loss_mlp": 0.01181711, + "balance_loss_clip": 1.00251079, + "balance_loss_mlp": 1.00127542, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 3.440381091821534, + "language_loss": 0.80788392, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83160698, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.5420644283294678 + }, + { + "auxiliary_loss_clip": 0.01157401, + "auxiliary_loss_mlp": 0.01181414, + "balance_loss_clip": 1.00231838, + "balance_loss_mlp": 1.00126445, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 2.319770911816168, + "language_loss": 0.81243479, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83582294, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.607651710510254 + }, + { + "auxiliary_loss_clip": 0.01158199, + "auxiliary_loss_mlp": 0.01177804, + "balance_loss_clip": 1.00324535, + "balance_loss_mlp": 1.0000391, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8801337675143011, + "language_loss": 0.6160928, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63945293, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.110746145248413 + }, + { + "auxiliary_loss_clip": 0.0115771, + "auxiliary_loss_mlp": 0.01181465, + "balance_loss_clip": 1.00230646, + "balance_loss_mlp": 1.00112545, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 2.022273531050014, + "language_loss": 0.89231819, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91570997, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.6616110801696777 + }, + { + "auxiliary_loss_clip": 0.01157273, + "auxiliary_loss_mlp": 0.01181576, + "balance_loss_clip": 1.00209415, + "balance_loss_mlp": 1.00142705, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.8453497799031788, + "language_loss": 0.74595785, + "learning_rate": 3.994641402486977e-06, + "loss": 0.7693463, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.709290027618408 + }, + { + "auxiliary_loss_clip": 0.01173861, + "auxiliary_loss_mlp": 0.01180953, + "balance_loss_clip": 1.00225627, + "balance_loss_mlp": 1.0008992, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.8163884003703408, + "language_loss": 0.92889619, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95244431, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.652411699295044 + }, + { + "auxiliary_loss_clip": 0.01192226, + "auxiliary_loss_mlp": 0.01181248, + "balance_loss_clip": 1.00417519, + "balance_loss_mlp": 1.00271976, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8322664710753068, + "language_loss": 0.62936318, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65309787, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.1202142238616943 + }, + { + "auxiliary_loss_clip": 0.01157253, + "auxiliary_loss_mlp": 0.01181224, + "balance_loss_clip": 1.00221109, + "balance_loss_mlp": 1.0011704, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.875901438193497, + "language_loss": 0.85728443, + "learning_rate": 3.994555590795299e-06, + "loss": 0.88066924, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.5779221057891846 + }, + { + "auxiliary_loss_clip": 0.01190592, + "auxiliary_loss_mlp": 0.01180935, + "balance_loss_clip": 1.00258422, + "balance_loss_mlp": 1.00107241, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.053207277179106, + "language_loss": 0.83031487, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85403013, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.5734634399414062 + }, + { + "auxiliary_loss_clip": 0.01157209, + "auxiliary_loss_mlp": 0.01181749, + "balance_loss_clip": 1.00215399, + "balance_loss_mlp": 1.00150514, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 2.186336536124984, + "language_loss": 0.84262472, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86601424, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.572476625442505 + }, + { + "auxiliary_loss_clip": 0.01141167, + "auxiliary_loss_mlp": 0.01181512, + "balance_loss_clip": 1.00240815, + "balance_loss_mlp": 1.00145853, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.2022505004562682, + "language_loss": 0.87298167, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89620841, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.6418848037719727 + }, + { + "auxiliary_loss_clip": 0.01173882, + "auxiliary_loss_mlp": 0.0118148, + "balance_loss_clip": 1.00233364, + "balance_loss_mlp": 1.00133133, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.961004650108553, + "language_loss": 0.87769967, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90125322, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.571784496307373 + }, + { + "auxiliary_loss_clip": 0.01190415, + "auxiliary_loss_mlp": 0.01181489, + "balance_loss_clip": 1.00243187, + "balance_loss_mlp": 1.00133979, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.550143884401855, + "language_loss": 0.69184947, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71556848, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.634324550628662 + }, + { + "auxiliary_loss_clip": 0.0114118, + "auxiliary_loss_mlp": 0.01181137, + "balance_loss_clip": 1.00238335, + "balance_loss_mlp": 1.00098753, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 3.4063787727434685, + "language_loss": 0.76337957, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78660274, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.6728904247283936 + }, + { + "auxiliary_loss_clip": 0.01124373, + "auxiliary_loss_mlp": 0.01181089, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00122607, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.331226038796127, + "language_loss": 0.861076, + "learning_rate": 3.994352716384659e-06, + "loss": 0.8841306, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.665635108947754 + }, + { + "auxiliary_loss_clip": 0.01157924, + "auxiliary_loss_mlp": 0.01181269, + "balance_loss_clip": 1.00235271, + "balance_loss_mlp": 1.00140631, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6834705888542865, + "language_loss": 0.85876745, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88215935, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.7020668983459473 + }, + { + "auxiliary_loss_clip": 0.01140851, + "auxiliary_loss_mlp": 0.01181598, + "balance_loss_clip": 1.00212657, + "balance_loss_mlp": 1.00154448, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.051272954738447, + "language_loss": 0.89278615, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91601068, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.612736701965332 + }, + { + "auxiliary_loss_clip": 0.01124828, + "auxiliary_loss_mlp": 0.01181565, + "balance_loss_clip": 1.00218487, + "balance_loss_mlp": 1.00141609, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.371580266453731, + "language_loss": 0.75258112, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77564514, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.7190544605255127 + }, + { + "auxiliary_loss_clip": 0.01124188, + "auxiliary_loss_mlp": 0.01181297, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00152922, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.8459108557224106, + "language_loss": 0.88425952, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90731442, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.6905529499053955 + }, + { + "auxiliary_loss_clip": 0.01190306, + "auxiliary_loss_mlp": 0.01181134, + "balance_loss_clip": 1.0024606, + "balance_loss_mlp": 1.00107992, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9868284471924174, + "language_loss": 0.88501239, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90872681, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.5180606842041016 + }, + { + "auxiliary_loss_clip": 0.01157619, + "auxiliary_loss_mlp": 0.0118152, + "balance_loss_clip": 1.00231743, + "balance_loss_mlp": 1.00146592, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.5846774419296485, + "language_loss": 0.93549359, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95888501, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 2.619372606277466 + }, + { + "auxiliary_loss_clip": 0.01174385, + "auxiliary_loss_mlp": 0.01181226, + "balance_loss_clip": 1.00245345, + "balance_loss_mlp": 1.00126767, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.205407341766835, + "language_loss": 0.72243881, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74599493, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.591688394546509 + }, + { + "auxiliary_loss_clip": 0.01157706, + "auxiliary_loss_mlp": 0.00749722, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.00059104, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 1.6456464707713037, + "language_loss": 0.82465434, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84372866, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 2.6352555751800537 + }, + { + "auxiliary_loss_clip": 0.01157085, + "auxiliary_loss_mlp": 0.01180784, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00120676, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.6183161940753616, + "language_loss": 0.8174808, + "learning_rate": 3.994086432835114e-06, + "loss": 0.84085941, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 4.082696437835693 + }, + { + "auxiliary_loss_clip": 0.01174313, + "auxiliary_loss_mlp": 0.01181058, + "balance_loss_clip": 1.00245702, + "balance_loss_mlp": 1.00119543, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.425020576367929, + "language_loss": 0.75743222, + "learning_rate": 3.994056467679221e-06, + "loss": 0.78098595, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 3.968019485473633 + }, + { + "auxiliary_loss_clip": 0.01157461, + "auxiliary_loss_mlp": 0.01181187, + "balance_loss_clip": 1.00255597, + "balance_loss_mlp": 1.00132418, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 1.9690607833892682, + "language_loss": 0.86348361, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88687009, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 2.761430501937866 + }, + { + "auxiliary_loss_clip": 0.01190417, + "auxiliary_loss_mlp": 0.00749709, + "balance_loss_clip": 1.00254524, + "balance_loss_mlp": 1.00060725, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.3782718784639343, + "language_loss": 0.8837316, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90313286, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 3.870805501937866 + }, + { + "auxiliary_loss_clip": 0.01173695, + "auxiliary_loss_mlp": 0.01180979, + "balance_loss_clip": 1.00234389, + "balance_loss_mlp": 1.0012114, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 2.4061732664195272, + "language_loss": 0.90200102, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92554778, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 4.004746913909912 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01180868, + "balance_loss_clip": 1.00244188, + "balance_loss_mlp": 1.00148225, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 2.753387007401455, + "language_loss": 0.91942668, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94281089, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.5468297004699707 + }, + { + "auxiliary_loss_clip": 0.01157725, + "auxiliary_loss_mlp": 0.01181394, + "balance_loss_clip": 1.00237155, + "balance_loss_mlp": 1.00153136, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.0607006678269872, + "language_loss": 0.75645936, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77985054, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.618691921234131 + }, + { + "auxiliary_loss_clip": 0.01174094, + "auxiliary_loss_mlp": 0.01180616, + "balance_loss_clip": 1.00240493, + "balance_loss_mlp": 1.0011344, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.757719824292926, + "language_loss": 0.74276626, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76631331, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.537519931793213 + }, + { + "auxiliary_loss_clip": 0.01140878, + "auxiliary_loss_mlp": 0.0118073, + "balance_loss_clip": 1.00223899, + "balance_loss_mlp": 1.00143957, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.25495285112664, + "language_loss": 0.84921104, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87242711, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.604551076889038 + }, + { + "auxiliary_loss_clip": 0.01124843, + "auxiliary_loss_mlp": 0.01181335, + "balance_loss_clip": 1.00220859, + "balance_loss_mlp": 1.00147164, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 1.8647569561142532, + "language_loss": 0.86498326, + "learning_rate": 3.993814024394569e-06, + "loss": 0.88804507, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.710670232772827 + }, + { + "auxiliary_loss_clip": 0.01174064, + "auxiliary_loss_mlp": 0.01180887, + "balance_loss_clip": 1.00240088, + "balance_loss_mlp": 1.00121498, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.3420368805492857, + "language_loss": 0.74984896, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77339852, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.5532290935516357 + }, + { + "auxiliary_loss_clip": 0.01175725, + "auxiliary_loss_mlp": 0.0118123, + "balance_loss_clip": 1.00246882, + "balance_loss_mlp": 1.00136685, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.224803755639558, + "language_loss": 0.85979223, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88336176, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.565009832382202 + }, + { + "auxiliary_loss_clip": 0.01157432, + "auxiliary_loss_mlp": 0.01180681, + "balance_loss_clip": 1.00247014, + "balance_loss_mlp": 1.0016762, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.9601452568269802, + "language_loss": 0.74446654, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76784766, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.5950334072113037 + }, + { + "auxiliary_loss_clip": 0.01157233, + "auxiliary_loss_mlp": 0.01180945, + "balance_loss_clip": 1.00239158, + "balance_loss_mlp": 1.00146341, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.1061213646246064, + "language_loss": 0.87417197, + "learning_rate": 3.993690988180309e-06, + "loss": 0.8975538, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.6915199756622314 + }, + { + "auxiliary_loss_clip": 0.01173753, + "auxiliary_loss_mlp": 0.01181255, + "balance_loss_clip": 1.0023458, + "balance_loss_mlp": 1.00158262, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.7856029328106318, + "language_loss": 0.87046754, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89401758, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.5758206844329834 + }, + { + "auxiliary_loss_clip": 0.01157643, + "auxiliary_loss_mlp": 0.01180711, + "balance_loss_clip": 1.00243604, + "balance_loss_mlp": 1.00161088, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.39431328610274, + "language_loss": 0.90014768, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.92353117, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.580970048904419 + }, + { + "auxiliary_loss_clip": 0.01157336, + "auxiliary_loss_mlp": 0.01180764, + "balance_loss_clip": 1.00226891, + "balance_loss_mlp": 1.00156832, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.7055383387805936, + "language_loss": 0.70906717, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73244816, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.5732293128967285 + }, + { + "auxiliary_loss_clip": 0.01157412, + "auxiliary_loss_mlp": 0.0118047, + "balance_loss_clip": 1.0022819, + "balance_loss_mlp": 1.00108397, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.8884881692829878, + "language_loss": 0.83428037, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85765922, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.5843682289123535 + }, + { + "auxiliary_loss_clip": 0.01174246, + "auxiliary_loss_mlp": 0.01181289, + "balance_loss_clip": 1.00240779, + "balance_loss_mlp": 1.00152111, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 3.9960312787028807, + "language_loss": 0.7645849, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78814024, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.5451951026916504 + }, + { + "auxiliary_loss_clip": 0.01157129, + "auxiliary_loss_mlp": 0.0118036, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.00106883, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.1992099131705682, + "language_loss": 0.8285948, + "learning_rate": 3.993504165853694e-06, + "loss": 0.85196972, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.554002523422241 + }, + { + "auxiliary_loss_clip": 0.01174176, + "auxiliary_loss_mlp": 0.01180628, + "balance_loss_clip": 1.00265217, + "balance_loss_mlp": 1.00133753, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 1.8838328007172624, + "language_loss": 0.8331356, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85668361, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.606797933578491 + }, + { + "auxiliary_loss_clip": 0.01174003, + "auxiliary_loss_mlp": 0.00749701, + "balance_loss_clip": 1.00242162, + "balance_loss_mlp": 1.00063109, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.2099989227579155, + "language_loss": 0.90126896, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.920506, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.5956621170043945 + }, + { + "auxiliary_loss_clip": 0.01173573, + "auxiliary_loss_mlp": 0.01179974, + "balance_loss_clip": 1.00239038, + "balance_loss_mlp": 1.00115967, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.9666549953682002, + "language_loss": 0.89906412, + "learning_rate": 3.993409734157064e-06, + "loss": 0.92259955, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.5392251014709473 + }, + { + "auxiliary_loss_clip": 0.01142442, + "auxiliary_loss_mlp": 0.01181071, + "balance_loss_clip": 1.00223112, + "balance_loss_mlp": 1.00130391, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 2.016860936040785, + "language_loss": 0.80062526, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386041, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.6399784088134766 + }, + { + "auxiliary_loss_clip": 0.01108489, + "auxiliary_loss_mlp": 0.01181165, + "balance_loss_clip": 1.00207949, + "balance_loss_mlp": 1.00158858, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 2.139381212954015, + "language_loss": 0.7978698, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.82076639, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.712284564971924 + }, + { + "auxiliary_loss_clip": 0.01173626, + "auxiliary_loss_mlp": 0.01180787, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.00140071, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.8373394012146274, + "language_loss": 0.89184153, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91538572, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.609149932861328 + }, + { + "auxiliary_loss_clip": 0.01190205, + "auxiliary_loss_mlp": 0.01180993, + "balance_loss_clip": 1.00242996, + "balance_loss_mlp": 1.00151157, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.494611356394158, + "language_loss": 0.87458789, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89829981, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.607077121734619 + }, + { + "auxiliary_loss_clip": 0.01157396, + "auxiliary_loss_mlp": 0.01180634, + "balance_loss_clip": 1.00236535, + "balance_loss_mlp": 1.00096202, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.226232379058892, + "language_loss": 0.65830094, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68168122, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.7484376430511475 + }, + { + "auxiliary_loss_clip": 0.01174061, + "auxiliary_loss_mlp": 0.01180714, + "balance_loss_clip": 1.00254548, + "balance_loss_mlp": 1.00142288, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 1.9429687757107748, + "language_loss": 0.72286308, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74641085, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.5867528915405273 + }, + { + "auxiliary_loss_clip": 0.01157675, + "auxiliary_loss_mlp": 0.01180243, + "balance_loss_clip": 1.00232291, + "balance_loss_mlp": 1.00114274, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.535757446031341, + "language_loss": 0.81694317, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84032238, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.620349884033203 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01180116, + "balance_loss_clip": 1.00227702, + "balance_loss_mlp": 1.00120687, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.016441871281339, + "language_loss": 0.78716636, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81070316, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.5859570503234863 + }, + { + "auxiliary_loss_clip": 0.0114039, + "auxiliary_loss_mlp": 0.01180044, + "balance_loss_clip": 1.00189126, + "balance_loss_mlp": 1.00132537, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 2.141381528214351, + "language_loss": 1.02015042, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04335475, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.693161725997925 + }, + { + "auxiliary_loss_clip": 0.01125219, + "auxiliary_loss_mlp": 0.01179954, + "balance_loss_clip": 1.00227106, + "balance_loss_mlp": 1.00085449, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.0909967490512766, + "language_loss": 0.81253099, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83558273, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.7151384353637695 + }, + { + "auxiliary_loss_clip": 0.01173703, + "auxiliary_loss_mlp": 0.01180834, + "balance_loss_clip": 1.00236523, + "balance_loss_mlp": 1.00135267, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 3.1845872352426796, + "language_loss": 0.73422766, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75777304, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.5678677558898926 + }, + { + "auxiliary_loss_clip": 0.01191272, + "auxiliary_loss_mlp": 0.01181795, + "balance_loss_clip": 1.00350404, + "balance_loss_mlp": 1.00326717, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7851312039608823, + "language_loss": 0.59821737, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.621948, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.1556990146636963 + }, + { + "auxiliary_loss_clip": 0.01173333, + "auxiliary_loss_mlp": 0.01180206, + "balance_loss_clip": 1.00235581, + "balance_loss_mlp": 1.00139213, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 3.588815365272602, + "language_loss": 0.95236242, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97589785, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.571506977081299 + }, + { + "auxiliary_loss_clip": 0.0114071, + "auxiliary_loss_mlp": 0.01180631, + "balance_loss_clip": 1.00206816, + "balance_loss_mlp": 1.00134039, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 3.3241167710366177, + "language_loss": 0.7188465, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74205995, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.7453181743621826 + }, + { + "auxiliary_loss_clip": 0.01156678, + "auxiliary_loss_mlp": 0.0118037, + "balance_loss_clip": 1.00216436, + "balance_loss_mlp": 1.00127029, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 3.514619633305031, + "language_loss": 0.85616726, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87953776, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 3.9862167835235596 + }, + { + "auxiliary_loss_clip": 0.01175486, + "auxiliary_loss_mlp": 0.00749663, + "balance_loss_clip": 1.00248206, + "balance_loss_mlp": 1.00055659, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 2.1229999146421332, + "language_loss": 0.83625925, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85551071, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.6332569122314453 + }, + { + "auxiliary_loss_clip": 0.01173514, + "auxiliary_loss_mlp": 0.01180413, + "balance_loss_clip": 1.00233364, + "balance_loss_mlp": 1.00150406, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 1.8615484192622473, + "language_loss": 0.73710591, + "learning_rate": 3.992861771819365e-06, + "loss": 0.76064515, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 4.067806959152222 + }, + { + "auxiliary_loss_clip": 0.01124364, + "auxiliary_loss_mlp": 0.01180316, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00140643, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 3.199381665736403, + "language_loss": 0.86950618, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89255297, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.6529176235198975 + }, + { + "auxiliary_loss_clip": 0.01124351, + "auxiliary_loss_mlp": 0.01179847, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00122428, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.6501198079115538, + "language_loss": 0.80111283, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82415479, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 4.165586233139038 + }, + { + "auxiliary_loss_clip": 0.01191685, + "auxiliary_loss_mlp": 0.01182841, + "balance_loss_clip": 1.0037055, + "balance_loss_mlp": 1.00431335, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8220681067717548, + "language_loss": 0.69162792, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71537322, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 3.0518834590911865 + }, + { + "auxiliary_loss_clip": 0.01190104, + "auxiliary_loss_mlp": 0.0118015, + "balance_loss_clip": 1.00247407, + "balance_loss_mlp": 1.00114584, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 3.5821138342129513, + "language_loss": 0.75844598, + "learning_rate": 3.992729665360331e-06, + "loss": 0.7821486, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 2.5202794075012207 + }, + { + "auxiliary_loss_clip": 0.0117511, + "auxiliary_loss_mlp": 0.01182314, + "balance_loss_clip": 1.00352347, + "balance_loss_mlp": 1.00378644, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8573657180251814, + "language_loss": 0.64333194, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66690618, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 3.086311101913452 + }, + { + "auxiliary_loss_clip": 0.01141287, + "auxiliary_loss_mlp": 0.01180369, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00107813, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 3.701504857642417, + "language_loss": 0.78994757, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81316411, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.617960214614868 + }, + { + "auxiliary_loss_clip": 0.01157823, + "auxiliary_loss_mlp": 0.01179952, + "balance_loss_clip": 1.00230455, + "balance_loss_mlp": 1.00132871, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 4.015300593816615, + "language_loss": 0.74031609, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76369381, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.658064842224121 + }, + { + "auxiliary_loss_clip": 0.011734, + "auxiliary_loss_mlp": 0.01179933, + "balance_loss_clip": 1.00233865, + "balance_loss_mlp": 1.00130939, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.8840976594828365, + "language_loss": 0.7058568, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72939014, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.592402219772339 + }, + { + "auxiliary_loss_clip": 0.0112474, + "auxiliary_loss_mlp": 0.01179732, + "balance_loss_clip": 1.00226521, + "balance_loss_mlp": 1.00139475, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 3.309472618573358, + "language_loss": 0.80707449, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83011919, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.7046241760253906 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01179564, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.00103641, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.154799289420961, + "language_loss": 0.88411987, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90765101, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.6255319118499756 + }, + { + "auxiliary_loss_clip": 0.01173361, + "auxiliary_loss_mlp": 0.01180217, + "balance_loss_clip": 1.00226283, + "balance_loss_mlp": 1.00092638, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.9456356868028557, + "language_loss": 0.75268126, + "learning_rate": 3.992495569872206e-06, + "loss": 0.7762171, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.515385150909424 + }, + { + "auxiliary_loss_clip": 0.01173442, + "auxiliary_loss_mlp": 0.01180108, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.0011034, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7059207596088626, + "language_loss": 0.79857153, + "learning_rate": 3.992461825426906e-06, + "loss": 0.82210696, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.560800075531006 + }, + { + "auxiliary_loss_clip": 0.01173271, + "auxiliary_loss_mlp": 0.0117944, + "balance_loss_clip": 1.00223267, + "balance_loss_mlp": 1.00091243, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.5061249149602647, + "language_loss": 0.82880378, + "learning_rate": 3.992428005427252e-06, + "loss": 0.85233086, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.5322320461273193 + }, + { + "auxiliary_loss_clip": 0.01189935, + "auxiliary_loss_mlp": 0.01179649, + "balance_loss_clip": 1.00235415, + "balance_loss_mlp": 1.00093007, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 1.9710849928542853, + "language_loss": 0.79377079, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81746662, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.4832956790924072 + }, + { + "auxiliary_loss_clip": 0.01157601, + "auxiliary_loss_mlp": 0.01179728, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00100994, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 3.5596681860213155, + "language_loss": 0.86510706, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88848042, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.5565760135650635 + }, + { + "auxiliary_loss_clip": 0.01189896, + "auxiliary_loss_mlp": 0.01179778, + "balance_loss_clip": 1.00227821, + "balance_loss_mlp": 1.00125015, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 2.0821129174410475, + "language_loss": 0.87480813, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89850485, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.527097702026367 + }, + { + "auxiliary_loss_clip": 0.01173333, + "auxiliary_loss_mlp": 0.01179529, + "balance_loss_clip": 1.00234151, + "balance_loss_mlp": 1.00100088, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 15.981169777970617, + "language_loss": 0.78973818, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81326675, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.5225791931152344 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01179703, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00108016, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.4927578647817894, + "language_loss": 0.82186162, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84523308, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.6271657943725586 + }, + { + "auxiliary_loss_clip": 0.01157866, + "auxiliary_loss_mlp": 0.01180149, + "balance_loss_clip": 1.00220942, + "balance_loss_mlp": 1.00095367, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.941128604437645, + "language_loss": 0.86296129, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88634145, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.6146304607391357 + }, + { + "auxiliary_loss_clip": 0.01156936, + "auxiliary_loss_mlp": 0.01179664, + "balance_loss_clip": 1.00208831, + "balance_loss_mlp": 1.00094533, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 2.003809601286518, + "language_loss": 0.79254091, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81590688, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.606004238128662 + }, + { + "auxiliary_loss_clip": 0.01157391, + "auxiliary_loss_mlp": 0.01180081, + "balance_loss_clip": 1.00224543, + "balance_loss_mlp": 1.00117171, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 4.311457261773988, + "language_loss": 0.86958075, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89295542, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.5816166400909424 + }, + { + "auxiliary_loss_clip": 0.01173548, + "auxiliary_loss_mlp": 0.0117999, + "balance_loss_clip": 1.00226879, + "balance_loss_mlp": 1.00089025, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.507121747267289, + "language_loss": 0.88207573, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.9056111, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.5607707500457764 + }, + { + "auxiliary_loss_clip": 0.01157443, + "auxiliary_loss_mlp": 0.01179748, + "balance_loss_clip": 1.00227714, + "balance_loss_mlp": 1.00102925, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.311029159926683, + "language_loss": 0.89792389, + "learning_rate": 3.992085650224914e-06, + "loss": 0.92129576, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.5570902824401855 + }, + { + "auxiliary_loss_clip": 0.01140156, + "auxiliary_loss_mlp": 0.01179464, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00103176, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 2.6001899979226284, + "language_loss": 0.75773537, + "learning_rate": 3.99205099921266e-06, + "loss": 0.78093159, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.673386335372925 + }, + { + "auxiliary_loss_clip": 0.01140738, + "auxiliary_loss_mlp": 0.01179644, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.00111639, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 2.337222339972761, + "language_loss": 0.80178005, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82498384, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.6140296459198 + }, + { + "auxiliary_loss_clip": 0.01156324, + "auxiliary_loss_mlp": 0.01179801, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00098729, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.9321123244635072, + "language_loss": 0.88298666, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90634799, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.580505609512329 + }, + { + "auxiliary_loss_clip": 0.01140431, + "auxiliary_loss_mlp": 0.01179769, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00105071, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.2600521016970907, + "language_loss": 0.78519416, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083961, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.70015549659729 + }, + { + "auxiliary_loss_clip": 0.01108106, + "auxiliary_loss_mlp": 0.01179638, + "balance_loss_clip": 1.00219381, + "balance_loss_mlp": 1.00120521, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 2.581680425065626, + "language_loss": 0.92506373, + "learning_rate": 3.991911639789094e-06, + "loss": 0.94794118, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.712693214416504 + }, + { + "auxiliary_loss_clip": 0.01157159, + "auxiliary_loss_mlp": 0.01179743, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.00111961, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.3768803464422628, + "language_loss": 0.68358105, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70695013, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.6530513763427734 + }, + { + "auxiliary_loss_clip": 0.01140797, + "auxiliary_loss_mlp": 0.01179602, + "balance_loss_clip": 1.00212574, + "balance_loss_mlp": 1.00126445, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.614421130280558, + "language_loss": 0.88587868, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90908265, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.6047041416168213 + }, + { + "auxiliary_loss_clip": 0.01157521, + "auxiliary_loss_mlp": 0.01179987, + "balance_loss_clip": 1.00243449, + "balance_loss_mlp": 1.00107813, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.6768719922088824, + "language_loss": 0.8478483, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87122339, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.6149420738220215 + }, + { + "auxiliary_loss_clip": 0.0115647, + "auxiliary_loss_mlp": 0.01179729, + "balance_loss_clip": 1.00206816, + "balance_loss_mlp": 1.0012964, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 5.926918088481862, + "language_loss": 0.77340978, + "learning_rate": 3.99177107182976e-06, + "loss": 0.79677182, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.562822103500366 + }, + { + "auxiliary_loss_clip": 0.01142347, + "auxiliary_loss_mlp": 0.01179432, + "balance_loss_clip": 1.0022893, + "balance_loss_mlp": 1.00109529, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4676258183379316, + "language_loss": 0.81664068, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83985853, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.63021183013916 + }, + { + "auxiliary_loss_clip": 0.01173207, + "auxiliary_loss_mlp": 0.01179604, + "balance_loss_clip": 1.00225401, + "balance_loss_mlp": 1.00107598, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.909114108238012, + "language_loss": 0.76717532, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.79070342, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.5644383430480957 + }, + { + "auxiliary_loss_clip": 0.01174293, + "auxiliary_loss_mlp": 0.01178823, + "balance_loss_clip": 1.00285196, + "balance_loss_mlp": 1.0010581, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.8068341149751788, + "language_loss": 0.57345241, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59698361, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 3.0474793910980225 + }, + { + "auxiliary_loss_clip": 0.01156775, + "auxiliary_loss_mlp": 0.01179472, + "balance_loss_clip": 1.00224578, + "balance_loss_mlp": 1.00103915, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.8896501703759836, + "language_loss": 0.82680458, + "learning_rate": 3.991629295419945e-06, + "loss": 0.8501671, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 2.5909879207611084 + }, + { + "auxiliary_loss_clip": 0.01173211, + "auxiliary_loss_mlp": 0.00749742, + "balance_loss_clip": 1.00222588, + "balance_loss_mlp": 1.00068665, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 4.735401870843298, + "language_loss": 0.77815735, + "learning_rate": 3.991593662507167e-06, + "loss": 0.79738688, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 4.103189468383789 + }, + { + "auxiliary_loss_clip": 0.01140625, + "auxiliary_loss_mlp": 0.01179683, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00105953, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 4.431745543197052, + "language_loss": 0.91704977, + "learning_rate": 3.991557954072958e-06, + "loss": 0.9402529, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 4.306742191314697 + }, + { + "auxiliary_loss_clip": 0.01156683, + "auxiliary_loss_mlp": 0.01179411, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00107408, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 2.0495273234335505, + "language_loss": 0.85843658, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88179755, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.6395840644836426 + }, + { + "auxiliary_loss_clip": 0.01140038, + "auxiliary_loss_mlp": 0.01179572, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.0011394, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.5201129269534026, + "language_loss": 0.87458086, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89777696, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 5.460866928100586 + }, + { + "auxiliary_loss_clip": 0.01173367, + "auxiliary_loss_mlp": 0.00749775, + "balance_loss_clip": 1.00228226, + "balance_loss_mlp": 1.00082374, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 4.489822980593569, + "language_loss": 0.75077266, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77000409, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.563852071762085 + }, + { + "auxiliary_loss_clip": 0.01173197, + "auxiliary_loss_mlp": 0.00749766, + "balance_loss_clip": 1.00234771, + "balance_loss_mlp": 1.00079083, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.9304169733976306, + "language_loss": 0.76969391, + "learning_rate": 3.991414365148936e-06, + "loss": 0.7889235, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.73042368888855 + }, + { + "auxiliary_loss_clip": 0.01189751, + "auxiliary_loss_mlp": 0.01179677, + "balance_loss_clip": 1.00232589, + "balance_loss_mlp": 1.00086308, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 1.9548479490388462, + "language_loss": 0.7677846, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79147887, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.506303071975708 + }, + { + "auxiliary_loss_clip": 0.01156769, + "auxiliary_loss_mlp": 0.01179697, + "balance_loss_clip": 1.00219345, + "balance_loss_mlp": 1.00116932, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 3.0718465441190594, + "language_loss": 0.87561023, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89897496, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.707192897796631 + }, + { + "auxiliary_loss_clip": 0.01156863, + "auxiliary_loss_mlp": 0.01179112, + "balance_loss_clip": 1.00241804, + "balance_loss_mlp": 1.00125146, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 2.1091500272681802, + "language_loss": 0.79290295, + "learning_rate": 3.991305880547527e-06, + "loss": 0.8162626, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.6039419174194336 + }, + { + "auxiliary_loss_clip": 0.0109234, + "auxiliary_loss_mlp": 0.01179413, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.00117111, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 3.4038851368325926, + "language_loss": 0.81100488, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83372247, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 2.9471521377563477 + }, + { + "auxiliary_loss_clip": 0.01158589, + "auxiliary_loss_mlp": 0.01178574, + "balance_loss_clip": 1.00302529, + "balance_loss_mlp": 1.00080895, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.9374419345750108, + "language_loss": 0.59007668, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61344838, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.3433642387390137 + }, + { + "auxiliary_loss_clip": 0.01189745, + "auxiliary_loss_mlp": 0.01179639, + "balance_loss_clip": 1.00247371, + "balance_loss_mlp": 1.00120616, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 3.2257850798723298, + "language_loss": 0.86989939, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.89359319, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.4964065551757812 + }, + { + "auxiliary_loss_clip": 0.01156799, + "auxiliary_loss_mlp": 0.01179472, + "balance_loss_clip": 1.00232339, + "balance_loss_mlp": 1.00103974, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.0691109984809577, + "language_loss": 0.79900193, + "learning_rate": 3.991160177271513e-06, + "loss": 0.82236469, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.6186962127685547 + }, + { + "auxiliary_loss_clip": 0.01158522, + "auxiliary_loss_mlp": 0.01179566, + "balance_loss_clip": 1.00222564, + "balance_loss_mlp": 1.00103807, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 16.27336338059362, + "language_loss": 0.84680337, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.8701843, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.6349363327026367 + }, + { + "auxiliary_loss_clip": 0.01173391, + "auxiliary_loss_mlp": 0.0117906, + "balance_loss_clip": 1.00240278, + "balance_loss_mlp": 1.00100899, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 2.238383835942545, + "language_loss": 0.84537524, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.8688997, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.5599164962768555 + }, + { + "auxiliary_loss_clip": 0.01173582, + "auxiliary_loss_mlp": 0.01179549, + "balance_loss_clip": 1.00247741, + "balance_loss_mlp": 1.00111663, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 3.83423333000126, + "language_loss": 0.77353752, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79706889, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.584977626800537 + }, + { + "auxiliary_loss_clip": 0.01124759, + "auxiliary_loss_mlp": 0.0117921, + "balance_loss_clip": 1.00224149, + "balance_loss_mlp": 1.00087321, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.095808296444459, + "language_loss": 0.90816498, + "learning_rate": 3.991013265915661e-06, + "loss": 0.93120468, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.6660451889038086 + }, + { + "auxiliary_loss_clip": 0.01173467, + "auxiliary_loss_mlp": 0.01179537, + "balance_loss_clip": 1.0023303, + "balance_loss_mlp": 1.0009141, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.9857100817603777, + "language_loss": 0.75929379, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.7828238, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.6193759441375732 + }, + { + "auxiliary_loss_clip": 0.01173072, + "auxiliary_loss_mlp": 0.01179352, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.00091982, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.006385720225806, + "language_loss": 0.713368, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73689222, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.694159984588623 + }, + { + "auxiliary_loss_clip": 0.01141678, + "auxiliary_loss_mlp": 0.01181151, + "balance_loss_clip": 1.0039022, + "balance_loss_mlp": 1.00414872, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9376967774776596, + "language_loss": 0.71083748, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73406577, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.083739995956421 + }, + { + "auxiliary_loss_clip": 0.01124678, + "auxiliary_loss_mlp": 0.01179715, + "balance_loss_clip": 1.00210738, + "balance_loss_mlp": 1.00118756, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 2.315383532982229, + "language_loss": 0.78550017, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80854404, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.721280336380005 + }, + { + "auxiliary_loss_clip": 0.01173148, + "auxiliary_loss_mlp": 0.01178858, + "balance_loss_clip": 1.00244713, + "balance_loss_mlp": 1.00109363, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 1.958275349957068, + "language_loss": 0.8627553, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88627541, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.5610382556915283 + }, + { + "auxiliary_loss_clip": 0.01189705, + "auxiliary_loss_mlp": 0.01179437, + "balance_loss_clip": 1.00237501, + "balance_loss_mlp": 1.00109982, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.704187593774545, + "language_loss": 0.76729321, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79098463, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.51115083694458 + }, + { + "auxiliary_loss_clip": 0.01123812, + "auxiliary_loss_mlp": 0.01179327, + "balance_loss_clip": 1.00204873, + "balance_loss_mlp": 1.00099015, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.5013231201483945, + "language_loss": 0.7479127, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77094412, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.6478238105773926 + }, + { + "auxiliary_loss_clip": 0.0114081, + "auxiliary_loss_mlp": 0.01179531, + "balance_loss_clip": 1.00229752, + "balance_loss_mlp": 1.00119376, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 2.6284997603340003, + "language_loss": 0.78856921, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81177264, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.6990463733673096 + }, + { + "auxiliary_loss_clip": 0.01189621, + "auxiliary_loss_mlp": 0.01179618, + "balance_loss_clip": 1.0024116, + "balance_loss_mlp": 1.00137591, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 14.330894123957727, + "language_loss": 0.80084825, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82454067, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.593158721923828 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01179007, + "balance_loss_clip": 1.00225556, + "balance_loss_mlp": 1.00095582, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 1.9159516838760426, + "language_loss": 0.86584949, + "learning_rate": 3.990640702763487e-06, + "loss": 0.88904017, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.6636617183685303 + }, + { + "auxiliary_loss_clip": 0.01140984, + "auxiliary_loss_mlp": 0.01179307, + "balance_loss_clip": 1.00227749, + "balance_loss_mlp": 1.00116062, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.9320832402724, + "language_loss": 0.88063413, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90383703, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 2.6532506942749023 + }, + { + "auxiliary_loss_clip": 0.01160557, + "auxiliary_loss_mlp": 0.01178255, + "balance_loss_clip": 1.00376678, + "balance_loss_mlp": 1.00201559, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0163110247762455, + "language_loss": 0.75374579, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77713394, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.244701385498047 + }, + { + "auxiliary_loss_clip": 0.01140743, + "auxiliary_loss_mlp": 0.01179236, + "balance_loss_clip": 1.00236607, + "balance_loss_mlp": 1.00147069, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.772798290445679, + "language_loss": 0.75776762, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78096741, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.6738758087158203 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.0117909, + "balance_loss_clip": 1.00221372, + "balance_loss_mlp": 1.00075293, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 2.2493395703994694, + "language_loss": 0.82690126, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85042709, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.5928313732147217 + }, + { + "auxiliary_loss_clip": 0.01157343, + "auxiliary_loss_mlp": 0.011794, + "balance_loss_clip": 1.00241256, + "balance_loss_mlp": 1.00106263, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.7267365331591606, + "language_loss": 0.86032718, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88369465, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.6289780139923096 + }, + { + "auxiliary_loss_clip": 0.01156763, + "auxiliary_loss_mlp": 0.0117927, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00102866, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.3999286552883934, + "language_loss": 0.74376935, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76712966, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.6062707901000977 + }, + { + "auxiliary_loss_clip": 0.01189424, + "auxiliary_loss_mlp": 0.01179341, + "balance_loss_clip": 1.00233638, + "balance_loss_mlp": 1.0010035, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.445338644108126, + "language_loss": 0.75391674, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77760446, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.5398621559143066 + }, + { + "auxiliary_loss_clip": 0.01156518, + "auxiliary_loss_mlp": 0.01178846, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.00089002, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 3.534258410151536, + "language_loss": 0.70174503, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72509873, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.5762293338775635 + }, + { + "auxiliary_loss_clip": 0.01173323, + "auxiliary_loss_mlp": 0.01179307, + "balance_loss_clip": 1.00230098, + "balance_loss_mlp": 1.00116062, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.7080582775053617, + "language_loss": 0.83565158, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85917783, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.528505802154541 + }, + { + "auxiliary_loss_clip": 0.01173712, + "auxiliary_loss_mlp": 0.01177829, + "balance_loss_clip": 1.00276792, + "balance_loss_mlp": 1.00006378, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8958799915085559, + "language_loss": 0.58936101, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61287642, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 3.1536762714385986 + }, + { + "auxiliary_loss_clip": 0.01157148, + "auxiliary_loss_mlp": 0.01179009, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00076783, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 4.874576520339968, + "language_loss": 0.74595839, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76932001, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 4.16223931312561 + }, + { + "auxiliary_loss_clip": 0.0115681, + "auxiliary_loss_mlp": 0.01179116, + "balance_loss_clip": 1.0021894, + "balance_loss_mlp": 1.00106537, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.93689145454545, + "language_loss": 0.80868006, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.8320393, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 2.616338014602661 + }, + { + "auxiliary_loss_clip": 0.01141815, + "auxiliary_loss_mlp": 0.0117907, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00111461, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 3.5775005209322157, + "language_loss": 0.78477645, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80798531, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 4.124775171279907 + }, + { + "auxiliary_loss_clip": 0.01173262, + "auxiliary_loss_mlp": 0.01178891, + "balance_loss_clip": 1.00231278, + "balance_loss_mlp": 1.00093496, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.291518660935283, + "language_loss": 0.93213844, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95565999, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.577314853668213 + }, + { + "auxiliary_loss_clip": 0.01107911, + "auxiliary_loss_mlp": 0.00749699, + "balance_loss_clip": 1.00224555, + "balance_loss_mlp": 1.00065494, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1332104543733035, + "language_loss": 0.71559769, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.73417383, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 5.594704627990723 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01179148, + "balance_loss_clip": 1.002321, + "balance_loss_mlp": 1.00100195, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.476708202884834, + "language_loss": 0.87253475, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89605844, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.5907955169677734 + }, + { + "auxiliary_loss_clip": 0.01173301, + "auxiliary_loss_mlp": 0.01179159, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00120366, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 2.2184933626616674, + "language_loss": 0.76682407, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79034865, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.5797924995422363 + }, + { + "auxiliary_loss_clip": 0.01172729, + "auxiliary_loss_mlp": 0.0074967, + "balance_loss_clip": 1.00222874, + "balance_loss_mlp": 1.00066006, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.9290390515586013, + "language_loss": 0.85747373, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87669766, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.540783166885376 + }, + { + "auxiliary_loss_clip": 0.01173035, + "auxiliary_loss_mlp": 0.01179294, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00114775, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.625117620633729, + "language_loss": 0.73133111, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75485432, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.586988925933838 + }, + { + "auxiliary_loss_clip": 0.01156067, + "auxiliary_loss_mlp": 0.01178571, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.0010922, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 2.033331365440628, + "language_loss": 0.79060245, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81394881, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.6194303035736084 + }, + { + "auxiliary_loss_clip": 0.01140081, + "auxiliary_loss_mlp": 0.01178671, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00100183, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 3.000122146239047, + "language_loss": 0.76101446, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78420204, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.6890196800231934 + }, + { + "auxiliary_loss_clip": 0.01156973, + "auxiliary_loss_mlp": 0.01179269, + "balance_loss_clip": 1.00221419, + "balance_loss_mlp": 1.00131357, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.388881028412724, + "language_loss": 0.86100364, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436604, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.580568552017212 + }, + { + "auxiliary_loss_clip": 0.01141867, + "auxiliary_loss_mlp": 0.01179096, + "balance_loss_clip": 1.00229883, + "balance_loss_mlp": 1.00133169, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 4.698487341251657, + "language_loss": 0.77092588, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79413557, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.600680351257324 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01179045, + "balance_loss_clip": 1.00204611, + "balance_loss_mlp": 1.00099385, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 3.1751420827122954, + "language_loss": 0.84089345, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86391819, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.704599142074585 + }, + { + "auxiliary_loss_clip": 0.01173095, + "auxiliary_loss_mlp": 0.01178827, + "balance_loss_clip": 1.00227332, + "balance_loss_mlp": 1.00106251, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.000656359546959, + "language_loss": 0.78994179, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81346095, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.7275171279907227 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01179461, + "balance_loss_clip": 1.00222111, + "balance_loss_mlp": 1.00140965, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 3.229924235729176, + "language_loss": 0.8767134, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90008044, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.5559630393981934 + }, + { + "auxiliary_loss_clip": 0.0115859, + "auxiliary_loss_mlp": 0.01178845, + "balance_loss_clip": 1.002249, + "balance_loss_mlp": 1.00108027, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 2.8144423398849474, + "language_loss": 0.83011365, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85348809, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.6481516361236572 + }, + { + "auxiliary_loss_clip": 0.0119011, + "auxiliary_loss_mlp": 0.01177067, + "balance_loss_clip": 1.00287938, + "balance_loss_mlp": 1.00006473, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.891713648767154, + "language_loss": 0.650774, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67444575, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.116370916366577 + }, + { + "auxiliary_loss_clip": 0.01157329, + "auxiliary_loss_mlp": 0.01179342, + "balance_loss_clip": 1.00236785, + "balance_loss_mlp": 1.00129104, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.5047795342591814, + "language_loss": 0.88215345, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90552008, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.6432881355285645 + }, + { + "auxiliary_loss_clip": 0.01156618, + "auxiliary_loss_mlp": 0.01179017, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00106144, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 3.2285227108060908, + "language_loss": 0.84704089, + "learning_rate": 3.989477727938335e-06, + "loss": 0.87039721, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.6209030151367188 + }, + { + "auxiliary_loss_clip": 0.01140712, + "auxiliary_loss_mlp": 0.01178917, + "balance_loss_clip": 1.00216103, + "balance_loss_mlp": 1.0009613, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 3.5654842342260533, + "language_loss": 0.82103616, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84423244, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.592902898788452 + }, + { + "auxiliary_loss_clip": 0.01123352, + "auxiliary_loss_mlp": 0.01178867, + "balance_loss_clip": 1.00224841, + "balance_loss_mlp": 1.00091183, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.3439471053826146, + "language_loss": 0.84308738, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86610961, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.619607448577881 + }, + { + "auxiliary_loss_clip": 0.01156329, + "auxiliary_loss_mlp": 0.01177013, + "balance_loss_clip": 1.0023464, + "balance_loss_mlp": 1.00001132, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9871736711963918, + "language_loss": 0.60445702, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62779045, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 2.9659643173217773 + }, + { + "auxiliary_loss_clip": 0.01156391, + "auxiliary_loss_mlp": 0.01179096, + "balance_loss_clip": 1.00217557, + "balance_loss_mlp": 1.00123537, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.4280137458865867, + "language_loss": 0.82117069, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84452558, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.5924735069274902 + }, + { + "auxiliary_loss_clip": 0.01189297, + "auxiliary_loss_mlp": 0.01179006, + "balance_loss_clip": 1.0024209, + "balance_loss_mlp": 1.00124073, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 4.521187985672719, + "language_loss": 0.79622698, + "learning_rate": 3.989277296609237e-06, + "loss": 0.81990999, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 2.4891703128814697 + }, + { + "auxiliary_loss_clip": 0.0115624, + "auxiliary_loss_mlp": 0.01178947, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.00127721, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.9706279192791905, + "language_loss": 0.7737962, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79714799, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.5809152126312256 + }, + { + "auxiliary_loss_clip": 0.0117313, + "auxiliary_loss_mlp": 0.01178957, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00128794, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.427268236460936, + "language_loss": 0.89058995, + "learning_rate": 3.989196596031776e-06, + "loss": 0.9141109, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.52996826171875 + }, + { + "auxiliary_loss_clip": 0.0117298, + "auxiliary_loss_mlp": 0.01179013, + "balance_loss_clip": 1.00219011, + "balance_loss_mlp": 1.001248, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.3668717598729567, + "language_loss": 0.85229075, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87581074, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.6456432342529297 + }, + { + "auxiliary_loss_clip": 0.01156346, + "auxiliary_loss_mlp": 0.01178937, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.0013628, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 2.012255318636986, + "language_loss": 0.81113768, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83449054, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 2.700937271118164 + }, + { + "auxiliary_loss_clip": 0.01123473, + "auxiliary_loss_mlp": 0.01178755, + "balance_loss_clip": 1.00220442, + "balance_loss_mlp": 1.00118065, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.176153015795921, + "language_loss": 0.77895623, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80197853, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 2.6903960704803467 + }, + { + "auxiliary_loss_clip": 0.01172768, + "auxiliary_loss_mlp": 0.01179026, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00126076, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.7873726029003518, + "language_loss": 0.86477643, + "learning_rate": 3.989034289722739e-06, + "loss": 0.88829434, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.6030642986297607 + }, + { + "auxiliary_loss_clip": 0.01172598, + "auxiliary_loss_mlp": 0.01178774, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00110435, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.60362789701352, + "language_loss": 0.81052965, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83404338, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.5743815898895264 + }, + { + "auxiliary_loss_clip": 0.01140456, + "auxiliary_loss_mlp": 0.01179009, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00133908, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 3.4037579260278368, + "language_loss": 0.85841602, + "learning_rate": 3.98895268401578e-06, + "loss": 0.88161063, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.6352946758270264 + }, + { + "auxiliary_loss_clip": 0.01156505, + "auxiliary_loss_mlp": 0.01179208, + "balance_loss_clip": 1.0022254, + "balance_loss_mlp": 1.00134742, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 2.663239604449904, + "language_loss": 0.8081162, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83147335, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.584519624710083 + }, + { + "auxiliary_loss_clip": 0.01189609, + "auxiliary_loss_mlp": 0.01178915, + "balance_loss_clip": 1.00265503, + "balance_loss_mlp": 1.00115061, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 4.33274682661806, + "language_loss": 0.69690633, + "learning_rate": 3.988870776623685e-06, + "loss": 0.7205916, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.5640952587127686 + }, + { + "auxiliary_loss_clip": 0.01189253, + "auxiliary_loss_mlp": 0.01178728, + "balance_loss_clip": 1.00233293, + "balance_loss_mlp": 1.00124967, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 4.521190014119791, + "language_loss": 0.81400365, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83768344, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.5235788822174072 + }, + { + "auxiliary_loss_clip": 0.01189237, + "auxiliary_loss_mlp": 0.01178439, + "balance_loss_clip": 1.00241184, + "balance_loss_mlp": 1.00095987, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 2.528066890688475, + "language_loss": 0.76186305, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78553975, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.6803436279296875 + }, + { + "auxiliary_loss_clip": 0.01172849, + "auxiliary_loss_mlp": 0.01178931, + "balance_loss_clip": 1.0024178, + "balance_loss_mlp": 1.00145185, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 3.787318727890818, + "language_loss": 0.92522895, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94874668, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.5548629760742188 + }, + { + "auxiliary_loss_clip": 0.01172573, + "auxiliary_loss_mlp": 0.01178903, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.00132918, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 2.3589110196053067, + "language_loss": 0.85782838, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88134313, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 3.993518352508545 + }, + { + "auxiliary_loss_clip": 0.01156187, + "auxiliary_loss_mlp": 0.0117888, + "balance_loss_clip": 1.00227976, + "balance_loss_mlp": 1.00130558, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.905021325358851, + "language_loss": 0.78492892, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80827951, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 4.207250118255615 + }, + { + "auxiliary_loss_clip": 0.01172789, + "auxiliary_loss_mlp": 0.01179071, + "balance_loss_clip": 1.00246096, + "balance_loss_mlp": 1.0015924, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 3.2129932958525838, + "language_loss": 0.77149546, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79501402, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.5751233100891113 + }, + { + "auxiliary_loss_clip": 0.01172917, + "auxiliary_loss_mlp": 0.01178946, + "balance_loss_clip": 1.00225377, + "balance_loss_mlp": 1.00137222, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.7653336112584377, + "language_loss": 0.77471632, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79823494, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 4.21151065826416 + }, + { + "auxiliary_loss_clip": 0.01156052, + "auxiliary_loss_mlp": 0.01179151, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.00148106, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 4.110683818507698, + "language_loss": 0.77506185, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79841387, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 3.977524518966675 + }, + { + "auxiliary_loss_clip": 0.0117269, + "auxiliary_loss_mlp": 0.01178857, + "balance_loss_clip": 1.00234628, + "balance_loss_mlp": 1.0011878, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.8210351863617227, + "language_loss": 0.83089554, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85441101, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.5666966438293457 + }, + { + "auxiliary_loss_clip": 0.01189323, + "auxiliary_loss_mlp": 0.01178787, + "balance_loss_clip": 1.00264084, + "balance_loss_mlp": 1.00130868, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 4.281321724662509, + "language_loss": 0.77469212, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79837322, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.5421671867370605 + }, + { + "auxiliary_loss_clip": 0.01156131, + "auxiliary_loss_mlp": 0.01179009, + "balance_loss_clip": 1.00227618, + "balance_loss_mlp": 1.00152981, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 4.062323063126787, + "language_loss": 0.80256641, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82591784, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.624359130859375 + }, + { + "auxiliary_loss_clip": 0.01189402, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_clip": 1.00258744, + "balance_loss_mlp": 1.00120759, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 3.5538433394973885, + "language_loss": 0.78489804, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80857992, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.503617525100708 + }, + { + "auxiliary_loss_clip": 0.01156407, + "auxiliary_loss_mlp": 0.00749609, + "balance_loss_clip": 1.0024879, + "balance_loss_mlp": 1.00063944, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 2.2710879192753817, + "language_loss": 0.84598553, + "learning_rate": 3.988331025862195e-06, + "loss": 0.86504573, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.589231014251709 + }, + { + "auxiliary_loss_clip": 0.0115706, + "auxiliary_loss_mlp": 0.01178676, + "balance_loss_clip": 1.00251448, + "balance_loss_mlp": 1.00138807, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 3.577831134357695, + "language_loss": 0.8574999, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.88085723, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.558166742324829 + }, + { + "auxiliary_loss_clip": 0.01140246, + "auxiliary_loss_mlp": 0.01178616, + "balance_loss_clip": 1.00236344, + "balance_loss_mlp": 1.00094652, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.6720916261875116, + "language_loss": 0.80768186, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83087051, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 2.641043186187744 + }, + { + "auxiliary_loss_clip": 0.01107363, + "auxiliary_loss_mlp": 0.01178661, + "balance_loss_clip": 1.0022068, + "balance_loss_mlp": 1.00108671, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 3.7200742321511933, + "language_loss": 0.81134665, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83420688, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.7686526775360107 + }, + { + "auxiliary_loss_clip": 0.01142953, + "auxiliary_loss_mlp": 0.01178551, + "balance_loss_clip": 1.00245357, + "balance_loss_mlp": 1.00154877, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 3.0954446778159634, + "language_loss": 0.83407688, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85729194, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.617368221282959 + }, + { + "auxiliary_loss_clip": 0.01156243, + "auxiliary_loss_mlp": 0.01178746, + "balance_loss_clip": 1.0022769, + "balance_loss_mlp": 1.0013628, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 3.0855859723395014, + "language_loss": 0.87563455, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89898449, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.704163074493408 + }, + { + "auxiliary_loss_clip": 0.0114024, + "auxiliary_loss_mlp": 0.01178853, + "balance_loss_clip": 1.00251436, + "balance_loss_mlp": 1.00127864, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 4.260110014952335, + "language_loss": 0.91490436, + "learning_rate": 3.988077612246394e-06, + "loss": 0.93809527, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.639188289642334 + }, + { + "auxiliary_loss_clip": 0.01156773, + "auxiliary_loss_mlp": 0.01178448, + "balance_loss_clip": 1.00242066, + "balance_loss_mlp": 1.00106454, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 2.0551931535210213, + "language_loss": 0.86569643, + "learning_rate": 3.988035112776035e-06, + "loss": 0.88904864, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.5607898235321045 + }, + { + "auxiliary_loss_clip": 0.01156293, + "auxiliary_loss_mlp": 0.01178609, + "balance_loss_clip": 1.00239933, + "balance_loss_mlp": 1.00132132, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 3.1829550868938976, + "language_loss": 0.77723581, + "learning_rate": 3.987992537919185e-06, + "loss": 0.80058479, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.65598201751709 + }, + { + "auxiliary_loss_clip": 0.01139856, + "auxiliary_loss_mlp": 0.01178354, + "balance_loss_clip": 1.0022198, + "balance_loss_mlp": 1.00125647, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 9.061507304588414, + "language_loss": 0.86157364, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88475573, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.6322922706604004 + }, + { + "auxiliary_loss_clip": 0.01189443, + "auxiliary_loss_mlp": 0.01178737, + "balance_loss_clip": 1.00258088, + "balance_loss_mlp": 1.00135362, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 5.563101742772916, + "language_loss": 0.80596721, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82964897, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.5449957847595215 + }, + { + "auxiliary_loss_clip": 0.01173455, + "auxiliary_loss_mlp": 0.01178702, + "balance_loss_clip": 1.00277972, + "balance_loss_mlp": 1.00141382, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 3.598793571127478, + "language_loss": 0.8443113, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86783284, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.5549139976501465 + }, + { + "auxiliary_loss_clip": 0.01140425, + "auxiliary_loss_mlp": 0.01178871, + "balance_loss_clip": 1.00248468, + "balance_loss_mlp": 1.00148773, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 3.0869580042127893, + "language_loss": 0.68783474, + "learning_rate": 3.987821484659211e-06, + "loss": 0.71102762, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.782362461090088 + }, + { + "auxiliary_loss_clip": 0.01189479, + "auxiliary_loss_mlp": 0.01178563, + "balance_loss_clip": 1.00266349, + "balance_loss_mlp": 1.00146568, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.0784341497942695, + "language_loss": 0.90023613, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92391658, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.507833957672119 + }, + { + "auxiliary_loss_clip": 0.01156223, + "auxiliary_loss_mlp": 0.01178658, + "balance_loss_clip": 1.00243521, + "balance_loss_mlp": 1.00146556, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.1596088728285694, + "language_loss": 0.83056599, + "learning_rate": 3.987735505752391e-06, + "loss": 0.8539148, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.5714986324310303 + }, + { + "auxiliary_loss_clip": 0.01156977, + "auxiliary_loss_mlp": 0.01178762, + "balance_loss_clip": 1.0026809, + "balance_loss_mlp": 1.00147414, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 11.571835862005848, + "language_loss": 0.89755023, + "learning_rate": 3.987692403235471e-06, + "loss": 0.92090762, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.6480259895324707 + }, + { + "auxiliary_loss_clip": 0.01158392, + "auxiliary_loss_mlp": 0.011785, + "balance_loss_clip": 1.00239968, + "balance_loss_mlp": 1.00140321, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 3.256378907187184, + "language_loss": 0.95485073, + "learning_rate": 3.987649225345056e-06, + "loss": 0.97821963, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.547938108444214 + }, + { + "auxiliary_loss_clip": 0.0112376, + "auxiliary_loss_mlp": 0.01178366, + "balance_loss_clip": 1.00224781, + "balance_loss_mlp": 1.00117302, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.9373048426084851, + "language_loss": 0.87924969, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90227097, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.6802520751953125 + }, + { + "auxiliary_loss_clip": 0.01140697, + "auxiliary_loss_mlp": 0.01178733, + "balance_loss_clip": 1.00242925, + "balance_loss_mlp": 1.00153983, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 2.2594051969926423, + "language_loss": 0.76233166, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78552592, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.6477298736572266 + }, + { + "auxiliary_loss_clip": 0.01156986, + "auxiliary_loss_mlp": 0.01178638, + "balance_loss_clip": 1.00257277, + "balance_loss_mlp": 1.00134945, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 2.9038234273913885, + "language_loss": 0.80895507, + "learning_rate": 3.987519239449226e-06, + "loss": 0.83231133, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.623351573944092 + }, + { + "auxiliary_loss_clip": 0.01172818, + "auxiliary_loss_mlp": 0.01178323, + "balance_loss_clip": 1.00260901, + "balance_loss_mlp": 1.00113058, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 3.4312534045896017, + "language_loss": 0.80600786, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82951927, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.612534761428833 + }, + { + "auxiliary_loss_clip": 0.01156803, + "auxiliary_loss_mlp": 0.01178541, + "balance_loss_clip": 1.00238657, + "balance_loss_mlp": 1.0011574, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.165993337490042, + "language_loss": 0.79013157, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81348503, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 2.5617482662200928 + }, + { + "auxiliary_loss_clip": 0.01157292, + "auxiliary_loss_mlp": 0.01178234, + "balance_loss_clip": 1.00254166, + "balance_loss_mlp": 1.00113654, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 3.6103650072102598, + "language_loss": 0.87681735, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90017259, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.5992157459259033 + }, + { + "auxiliary_loss_clip": 0.0117333, + "auxiliary_loss_mlp": 0.01178303, + "balance_loss_clip": 1.00263524, + "balance_loss_mlp": 1.00111067, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.9394974590537, + "language_loss": 0.80675709, + "learning_rate": 3.98734486979218e-06, + "loss": 0.83027351, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.5315423011779785 + }, + { + "auxiliary_loss_clip": 0.01156965, + "auxiliary_loss_mlp": 0.01178454, + "balance_loss_clip": 1.00260699, + "balance_loss_mlp": 1.00107098, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.870774066924715, + "language_loss": 0.91638339, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93973768, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.6222920417785645 + }, + { + "auxiliary_loss_clip": 0.01189638, + "auxiliary_loss_mlp": 0.01178787, + "balance_loss_clip": 1.00274098, + "balance_loss_mlp": 1.00130868, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 10.735466863869362, + "language_loss": 0.78661323, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81029749, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.509437322616577 + }, + { + "auxiliary_loss_clip": 0.01140213, + "auxiliary_loss_mlp": 0.01178663, + "balance_loss_clip": 1.00235963, + "balance_loss_mlp": 1.00127959, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.307866699167963, + "language_loss": 0.69263816, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71582699, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 2.662898063659668 + }, + { + "auxiliary_loss_clip": 0.01123824, + "auxiliary_loss_mlp": 0.01178255, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.00125289, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 4.8667625917810335, + "language_loss": 0.71871233, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74173313, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.718013286590576 + }, + { + "auxiliary_loss_clip": 0.01126129, + "auxiliary_loss_mlp": 0.01178247, + "balance_loss_clip": 1.00226438, + "balance_loss_mlp": 1.00124454, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 3.791786227121512, + "language_loss": 0.84333277, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86637652, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.6781864166259766 + }, + { + "auxiliary_loss_clip": 0.01173357, + "auxiliary_loss_mlp": 0.01178262, + "balance_loss_clip": 1.00256896, + "balance_loss_mlp": 1.0009737, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.626780264075696, + "language_loss": 0.82801676, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85153294, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 4.000874280929565 + }, + { + "auxiliary_loss_clip": 0.01140008, + "auxiliary_loss_mlp": 0.01178194, + "balance_loss_clip": 1.00223303, + "balance_loss_mlp": 1.00090623, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 6.300159988225576, + "language_loss": 0.79371762, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81689966, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 4.108298301696777 + }, + { + "auxiliary_loss_clip": 0.01157461, + "auxiliary_loss_mlp": 0.0117835, + "balance_loss_clip": 1.00261712, + "balance_loss_mlp": 1.00125289, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.4329683415670162, + "language_loss": 0.6643374, + "learning_rate": 3.986992513289584e-06, + "loss": 0.6876955, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 2.6705737113952637 + }, + { + "auxiliary_loss_clip": 0.01157032, + "auxiliary_loss_mlp": 0.01178533, + "balance_loss_clip": 1.00264299, + "balance_loss_mlp": 1.00143576, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 3.393699255662584, + "language_loss": 0.77105498, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79441071, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 4.0244879722595215 + }, + { + "auxiliary_loss_clip": 0.01172822, + "auxiliary_loss_mlp": 0.01178406, + "balance_loss_clip": 1.00260091, + "balance_loss_mlp": 1.00111747, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.4817937799059453, + "language_loss": 0.85060239, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87411463, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 3.944331407546997 + }, + { + "auxiliary_loss_clip": 0.0115624, + "auxiliary_loss_mlp": 0.01178234, + "balance_loss_clip": 1.00253356, + "balance_loss_mlp": 1.00113654, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 12.512118882777877, + "language_loss": 0.7810176, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.8043623, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.6358649730682373 + }, + { + "auxiliary_loss_clip": 0.01156634, + "auxiliary_loss_mlp": 0.01178178, + "balance_loss_clip": 1.00243211, + "balance_loss_mlp": 1.00127196, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 2.484211942471539, + "language_loss": 0.71495539, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73830354, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.601691722869873 + }, + { + "auxiliary_loss_clip": 0.01156834, + "auxiliary_loss_mlp": 0.00749726, + "balance_loss_clip": 1.00241518, + "balance_loss_mlp": 1.0007304, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 2.3175955093794824, + "language_loss": 0.85514605, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87421167, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.6064772605895996 + }, + { + "auxiliary_loss_clip": 0.01189607, + "auxiliary_loss_mlp": 0.01178321, + "balance_loss_clip": 1.00279629, + "balance_loss_mlp": 1.0014149, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 3.6952967627008837, + "language_loss": 0.7179454, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74162471, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.5450692176818848 + }, + { + "auxiliary_loss_clip": 0.01090169, + "auxiliary_loss_mlp": 0.01178224, + "balance_loss_clip": 1.00172317, + "balance_loss_mlp": 1.0012219, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 3.3056547971906936, + "language_loss": 0.8321557, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85483968, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 2.9006221294403076 + }, + { + "auxiliary_loss_clip": 0.01189217, + "auxiliary_loss_mlp": 0.01178109, + "balance_loss_clip": 1.0024755, + "balance_loss_mlp": 1.0011065, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 3.8725842276002886, + "language_loss": 0.71261501, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73628831, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 2.8047609329223633 + }, + { + "auxiliary_loss_clip": 0.01156243, + "auxiliary_loss_mlp": 0.011783, + "balance_loss_clip": 1.00249648, + "balance_loss_mlp": 1.0012027, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 3.36072905634526, + "language_loss": 0.8796587, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90300417, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.6602096557617188 + }, + { + "auxiliary_loss_clip": 0.01156568, + "auxiliary_loss_mlp": 0.01178148, + "balance_loss_clip": 1.00248492, + "balance_loss_mlp": 1.00114608, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 2.430788827199771, + "language_loss": 0.81757009, + "learning_rate": 3.986545286538044e-06, + "loss": 0.84091723, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.697272777557373 + }, + { + "auxiliary_loss_clip": 0.01139465, + "auxiliary_loss_mlp": 0.0117808, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00126839, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.2954037508987497, + "language_loss": 0.6997807, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72295612, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.640514612197876 + }, + { + "auxiliary_loss_clip": 0.01172852, + "auxiliary_loss_mlp": 0.01177973, + "balance_loss_clip": 1.00257647, + "balance_loss_mlp": 1.00135231, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 2.9262939015841978, + "language_loss": 0.77412772, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79763591, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.591923475265503 + }, + { + "auxiliary_loss_clip": 0.01189508, + "auxiliary_loss_mlp": 0.01178357, + "balance_loss_clip": 1.00272441, + "balance_loss_mlp": 1.00126016, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 2.3329895649037233, + "language_loss": 0.78642178, + "learning_rate": 3.986409649500203e-06, + "loss": 0.81010044, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.6154279708862305 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01178272, + "balance_loss_clip": 1.00266171, + "balance_loss_mlp": 1.00136495, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 2.332040857901703, + "language_loss": 0.81975532, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84328794, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.5568506717681885 + }, + { + "auxiliary_loss_clip": 0.01173337, + "auxiliary_loss_mlp": 0.01178172, + "balance_loss_clip": 1.00253582, + "balance_loss_mlp": 1.00107467, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 3.2859414511450122, + "language_loss": 0.83037841, + "learning_rate": 3.986318848181186e-06, + "loss": 0.85389352, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.528174877166748 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01178515, + "balance_loss_clip": 1.00258422, + "balance_loss_mlp": 1.00141728, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.2615578017627027, + "language_loss": 0.73379409, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75714225, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.57029390335083 + }, + { + "auxiliary_loss_clip": 0.01173034, + "auxiliary_loss_mlp": 0.0117815, + "balance_loss_clip": 1.00248134, + "balance_loss_mlp": 1.0011481, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 4.501300217787182, + "language_loss": 0.86212951, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88564134, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.521883964538574 + }, + { + "auxiliary_loss_clip": 0.01156409, + "auxiliary_loss_mlp": 0.01177971, + "balance_loss_clip": 1.0025301, + "balance_loss_mlp": 1.00087404, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 6.190634227199203, + "language_loss": 0.82538396, + "learning_rate": 3.98618208129641e-06, + "loss": 0.8487277, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.5574593544006348 + }, + { + "auxiliary_loss_clip": 0.01173019, + "auxiliary_loss_mlp": 0.00749758, + "balance_loss_clip": 1.00272775, + "balance_loss_mlp": 1.0008899, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 2.540041153067429, + "language_loss": 0.82534051, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84456825, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.589012622833252 + }, + { + "auxiliary_loss_clip": 0.01139169, + "auxiliary_loss_mlp": 0.0117815, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.00105262, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 7.241530673622215, + "language_loss": 0.80710614, + "learning_rate": 3.986090526789227e-06, + "loss": 0.83027935, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.68228816986084 + }, + { + "auxiliary_loss_clip": 0.01155951, + "auxiliary_loss_mlp": 0.0117788, + "balance_loss_clip": 1.00244963, + "balance_loss_mlp": 1.00106883, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 5.4791573899530075, + "language_loss": 0.96392262, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98726094, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.5822274684906006 + }, + { + "auxiliary_loss_clip": 0.01172552, + "auxiliary_loss_mlp": 0.01177989, + "balance_loss_clip": 1.0025568, + "balance_loss_mlp": 1.0009867, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 2.8029255339394274, + "language_loss": 0.81809151, + "learning_rate": 3.985998671031039e-06, + "loss": 0.84159696, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.6934757232666016 + }, + { + "auxiliary_loss_clip": 0.01178584, + "auxiliary_loss_mlp": 0.01179209, + "balance_loss_clip": 1.00599575, + "balance_loss_mlp": 1.00373256, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 1.0872576471562543, + "language_loss": 0.56735301, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.59093094, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.100776433944702 + }, + { + "auxiliary_loss_clip": 0.01156762, + "auxiliary_loss_mlp": 0.01177878, + "balance_loss_clip": 1.00244188, + "balance_loss_mlp": 1.00097191, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 3.175260163791295, + "language_loss": 0.72439414, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74774051, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.6061851978302 + }, + { + "auxiliary_loss_clip": 0.01123006, + "auxiliary_loss_mlp": 0.01178451, + "balance_loss_clip": 1.00229716, + "balance_loss_mlp": 1.00144935, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 2.2106368579250524, + "language_loss": 0.78100467, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80401933, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.679182291030884 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01178084, + "balance_loss_clip": 1.00241971, + "balance_loss_mlp": 1.00108242, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.0761176691636205, + "language_loss": 0.71322721, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73640788, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.705862283706665 + }, + { + "auxiliary_loss_clip": 0.01139542, + "auxiliary_loss_mlp": 0.01178272, + "balance_loss_clip": 1.00233865, + "balance_loss_mlp": 1.00127029, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 3.073495501981828, + "language_loss": 0.78264391, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80582201, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.717924118041992 + }, + { + "auxiliary_loss_clip": 0.01140384, + "auxiliary_loss_mlp": 0.01177811, + "balance_loss_clip": 1.00238037, + "balance_loss_mlp": 1.00090432, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 5.009833348057232, + "language_loss": 0.79259002, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81577194, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.6292953491210938 + }, + { + "auxiliary_loss_clip": 0.01124316, + "auxiliary_loss_mlp": 0.01177584, + "balance_loss_clip": 1.00237417, + "balance_loss_mlp": 1.00086856, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 4.45463397448458, + "language_loss": 0.82807958, + "learning_rate": 3.985674803727289e-06, + "loss": 0.8510986, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.6757850646972656 + }, + { + "auxiliary_loss_clip": 0.01124773, + "auxiliary_loss_mlp": 0.0117851, + "balance_loss_clip": 1.00430059, + "balance_loss_mlp": 1.00303447, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8532429057811418, + "language_loss": 0.58091968, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60395253, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 3.178925037384033 + }, + { + "auxiliary_loss_clip": 0.01155933, + "auxiliary_loss_mlp": 0.01177995, + "balance_loss_clip": 1.00247538, + "balance_loss_mlp": 1.00127864, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.4861202734681163, + "language_loss": 0.91595155, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93929082, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.5768024921417236 + }, + { + "auxiliary_loss_clip": 0.01139716, + "auxiliary_loss_mlp": 0.00749788, + "balance_loss_clip": 1.00244689, + "balance_loss_mlp": 1.00087512, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 1.989925820333772, + "language_loss": 0.87566096, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89455599, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.6936280727386475 + }, + { + "auxiliary_loss_clip": 0.01175786, + "auxiliary_loss_mlp": 0.01178273, + "balance_loss_clip": 1.00568366, + "balance_loss_mlp": 1.00279653, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9300773925653487, + "language_loss": 0.59705651, + "learning_rate": 3.985488080124218e-06, + "loss": 0.62059706, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 3.0667266845703125 + }, + { + "auxiliary_loss_clip": 0.01157035, + "auxiliary_loss_mlp": 0.01177846, + "balance_loss_clip": 1.00251591, + "balance_loss_mlp": 1.00084388, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.237627733981037, + "language_loss": 0.83573389, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85908264, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.5960514545440674 + }, + { + "auxiliary_loss_clip": 0.01155978, + "auxiliary_loss_mlp": 0.01178223, + "balance_loss_clip": 1.00253975, + "balance_loss_mlp": 1.00131679, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 2.3519561582177038, + "language_loss": 0.85125756, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87459958, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 2.637453317642212 + }, + { + "auxiliary_loss_clip": 0.01189067, + "auxiliary_loss_mlp": 0.01178542, + "balance_loss_clip": 1.002707, + "balance_loss_mlp": 1.00144482, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.027121491562029, + "language_loss": 0.78739214, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81106818, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 3.916674852371216 + }, + { + "auxiliary_loss_clip": 0.01174628, + "auxiliary_loss_mlp": 0.01178647, + "balance_loss_clip": 1.00501132, + "balance_loss_mlp": 1.00240767, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7578848894551367, + "language_loss": 0.58300585, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60653859, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 4.5344319343566895 + }, + { + "auxiliary_loss_clip": 0.01123374, + "auxiliary_loss_mlp": 0.01178181, + "balance_loss_clip": 1.00224781, + "balance_loss_mlp": 1.00146484, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.1322051502337116, + "language_loss": 0.7154485, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73846412, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.7150323390960693 + }, + { + "auxiliary_loss_clip": 0.01122888, + "auxiliary_loss_mlp": 0.01178058, + "balance_loss_clip": 1.002092, + "balance_loss_mlp": 1.00096059, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 2.032584839407073, + "language_loss": 0.78890204, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81191146, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 5.529223680496216 + }, + { + "auxiliary_loss_clip": 0.01172434, + "auxiliary_loss_mlp": 0.01178087, + "balance_loss_clip": 1.00238514, + "balance_loss_mlp": 1.00108552, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.52728239430157, + "language_loss": 0.71664464, + "learning_rate": 3.985158415226128e-06, + "loss": 0.74014992, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.569408416748047 + }, + { + "auxiliary_loss_clip": 0.01139448, + "auxiliary_loss_mlp": 0.01178632, + "balance_loss_clip": 1.0024482, + "balance_loss_mlp": 1.00172567, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.648987436540211, + "language_loss": 0.81571555, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83889639, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.6615803241729736 + }, + { + "auxiliary_loss_clip": 0.01159045, + "auxiliary_loss_mlp": 0.01178563, + "balance_loss_clip": 1.00572932, + "balance_loss_mlp": 1.00308657, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.795069936617564, + "language_loss": 0.59809864, + "learning_rate": 3.985063547731735e-06, + "loss": 0.62147474, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.171912908554077 + }, + { + "auxiliary_loss_clip": 0.01189114, + "auxiliary_loss_mlp": 0.01178029, + "balance_loss_clip": 1.00279093, + "balance_loss_mlp": 1.00102687, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.723246825662254, + "language_loss": 0.81508267, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83875406, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.5594043731689453 + }, + { + "auxiliary_loss_clip": 0.01140743, + "auxiliary_loss_mlp": 0.01177983, + "balance_loss_clip": 1.00252938, + "balance_loss_mlp": 1.00107682, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.4135860314726565, + "language_loss": 0.75711673, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78030401, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.662986993789673 + }, + { + "auxiliary_loss_clip": 0.01090582, + "auxiliary_loss_mlp": 0.01177995, + "balance_loss_clip": 1.00222385, + "balance_loss_mlp": 1.00099325, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 3.014254900414777, + "language_loss": 0.71745753, + "learning_rate": 3.984920681941094e-06, + "loss": 0.7401433, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 2.9234845638275146 + }, + { + "auxiliary_loss_clip": 0.01123024, + "auxiliary_loss_mlp": 0.01178268, + "balance_loss_clip": 1.00209916, + "balance_loss_mlp": 1.00126576, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 3.400415385637227, + "language_loss": 0.8131904, + "learning_rate": 3.984872909471688e-06, + "loss": 0.83620328, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 2.708982467651367 + }, + { + "auxiliary_loss_clip": 0.01172703, + "auxiliary_loss_mlp": 0.01177932, + "balance_loss_clip": 1.00245619, + "balance_loss_mlp": 1.00131154, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.4075104431270677, + "language_loss": 0.81027585, + "learning_rate": 3.984825061735701e-06, + "loss": 0.8337822, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.5406486988067627 + }, + { + "auxiliary_loss_clip": 0.01155941, + "auxiliary_loss_mlp": 0.01178203, + "balance_loss_clip": 1.0023241, + "balance_loss_mlp": 1.00110531, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.7791435300580807, + "language_loss": 0.63689369, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.66023505, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 2.859177350997925 + }, + { + "auxiliary_loss_clip": 0.01106913, + "auxiliary_loss_mlp": 0.01177706, + "balance_loss_clip": 1.0021795, + "balance_loss_mlp": 1.00089478, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.956637029377731, + "language_loss": 0.7536478, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77649403, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.6920487880706787 + }, + { + "auxiliary_loss_clip": 0.01139772, + "auxiliary_loss_mlp": 0.00749965, + "balance_loss_clip": 1.00249624, + "balance_loss_mlp": 1.00122118, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0481778590562874, + "language_loss": 0.8731752, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89207256, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.641495943069458 + }, + { + "auxiliary_loss_clip": 0.01156359, + "auxiliary_loss_mlp": 0.00749924, + "balance_loss_clip": 1.00239348, + "balance_loss_mlp": 1.00116086, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.916540103083272, + "language_loss": 0.78590345, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80496627, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.62562894821167 + }, + { + "auxiliary_loss_clip": 0.01156018, + "auxiliary_loss_mlp": 0.01178183, + "balance_loss_clip": 1.00249517, + "balance_loss_mlp": 1.00127649, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.862801164455198, + "language_loss": 0.84274924, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86609113, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.701226234436035 + }, + { + "auxiliary_loss_clip": 0.01140255, + "auxiliary_loss_mlp": 0.01178052, + "balance_loss_clip": 1.00245929, + "balance_loss_mlp": 1.00114584, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.3071775006366293, + "language_loss": 0.78879595, + "learning_rate": 3.984536394823418e-06, + "loss": 0.81197906, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.6520683765411377 + }, + { + "auxiliary_loss_clip": 0.01188934, + "auxiliary_loss_mlp": 0.01177652, + "balance_loss_clip": 1.00267577, + "balance_loss_mlp": 1.00084078, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.469536954242908, + "language_loss": 0.85470545, + "learning_rate": 3.984488020272336e-06, + "loss": 0.8783713, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.579789161682129 + }, + { + "auxiliary_loss_clip": 0.01140196, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_clip": 1.00244462, + "balance_loss_mlp": 1.0010066, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 8.793385549950438, + "language_loss": 0.74830377, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77148294, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.8031482696533203 + }, + { + "auxiliary_loss_clip": 0.01156448, + "auxiliary_loss_mlp": 0.00750009, + "balance_loss_clip": 1.00267458, + "balance_loss_mlp": 1.0013814, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.890674863892193, + "language_loss": 0.6831373, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.7022019, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.6882922649383545 + }, + { + "auxiliary_loss_clip": 0.01172664, + "auxiliary_loss_mlp": 0.0117781, + "balance_loss_clip": 1.00273871, + "balance_loss_mlp": 1.00099874, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 2.5797898792177607, + "language_loss": 0.79174423, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81524897, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.6541876792907715 + }, + { + "auxiliary_loss_clip": 0.01155954, + "auxiliary_loss_mlp": 0.01177939, + "balance_loss_clip": 1.00254726, + "balance_loss_mlp": 1.00103235, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 2.1793003721129973, + "language_loss": 0.69047403, + "learning_rate": 3.984293769566553e-06, + "loss": 0.71381295, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.692072629928589 + }, + { + "auxiliary_loss_clip": 0.01172709, + "auxiliary_loss_mlp": 0.01177645, + "balance_loss_clip": 1.00262332, + "balance_loss_mlp": 1.00112009, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 2.1097315090327187, + "language_loss": 0.74429715, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76780069, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.611337423324585 + }, + { + "auxiliary_loss_clip": 0.01172872, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_clip": 1.00267148, + "balance_loss_mlp": 1.00104403, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 5.122149869044451, + "language_loss": 0.92078674, + "learning_rate": 3.984196192738577e-06, + "loss": 0.94429404, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.541963815689087 + }, + { + "auxiliary_loss_clip": 0.01188961, + "auxiliary_loss_mlp": 0.01177748, + "balance_loss_clip": 1.00271773, + "balance_loss_mlp": 1.00112724, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.8698590311789522, + "language_loss": 0.82954812, + "learning_rate": 3.984147291462285e-06, + "loss": 0.85321516, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.518054723739624 + }, + { + "auxiliary_loss_clip": 0.01188872, + "auxiliary_loss_mlp": 0.01177543, + "balance_loss_clip": 1.00273573, + "balance_loss_mlp": 1.00101733, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.3183371699863944, + "language_loss": 0.85457247, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87823659, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.5308375358581543 + }, + { + "auxiliary_loss_clip": 0.01122531, + "auxiliary_loss_mlp": 0.01177666, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00114059, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.0914029098362765, + "language_loss": 0.85922217, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88222414, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.634620428085327 + }, + { + "auxiliary_loss_clip": 0.01156251, + "auxiliary_loss_mlp": 0.01177613, + "balance_loss_clip": 1.00257194, + "balance_loss_mlp": 1.00089765, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.1167994975025395, + "language_loss": 0.69352049, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.7168591, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.5904147624969482 + }, + { + "auxiliary_loss_clip": 0.01188744, + "auxiliary_loss_mlp": 0.01177054, + "balance_loss_clip": 1.00258148, + "balance_loss_mlp": 1.00062394, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.6982518289059008, + "language_loss": 0.83554351, + "learning_rate": 3.983950933985064e-06, + "loss": 0.85920143, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.5897271633148193 + }, + { + "auxiliary_loss_clip": 0.0115546, + "auxiliary_loss_mlp": 0.01177666, + "balance_loss_clip": 1.00239301, + "balance_loss_mlp": 1.0009501, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 6.756729865413613, + "language_loss": 0.81538975, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83872104, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.5714948177337646 + }, + { + "auxiliary_loss_clip": 0.01188775, + "auxiliary_loss_mlp": 0.01177778, + "balance_loss_clip": 1.00267577, + "balance_loss_mlp": 1.00125265, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 2.1712104331915887, + "language_loss": 0.85917926, + "learning_rate": 3.983852303849291e-06, + "loss": 0.88284481, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.547661304473877 + }, + { + "auxiliary_loss_clip": 0.01172282, + "auxiliary_loss_mlp": 0.01177568, + "balance_loss_clip": 1.00266266, + "balance_loss_mlp": 1.00113845, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.2929705242950877, + "language_loss": 0.90904146, + "learning_rate": 3.983802875938651e-06, + "loss": 0.93253994, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.517289638519287 + }, + { + "auxiliary_loss_clip": 0.0115609, + "auxiliary_loss_mlp": 0.01177246, + "balance_loss_clip": 1.002496, + "balance_loss_mlp": 1.00091195, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.716310264783882, + "language_loss": 0.81912619, + "learning_rate": 3.983753372802008e-06, + "loss": 0.84245956, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.6118314266204834 + }, + { + "auxiliary_loss_clip": 0.01156353, + "auxiliary_loss_mlp": 0.01177941, + "balance_loss_clip": 1.00290024, + "balance_loss_mlp": 1.00160623, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 1.850424610166018, + "language_loss": 0.74841678, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77175975, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.65311861038208 + }, + { + "auxiliary_loss_clip": 0.01172437, + "auxiliary_loss_mlp": 0.00749841, + "balance_loss_clip": 1.00249624, + "balance_loss_mlp": 1.00112712, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.8869899643325627, + "language_loss": 0.71025693, + "learning_rate": 3.98365414085822e-06, + "loss": 0.72947967, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.5964949131011963 + }, + { + "auxiliary_loss_clip": 0.01158002, + "auxiliary_loss_mlp": 0.00749864, + "balance_loss_clip": 1.00255501, + "balance_loss_mlp": 1.00113475, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.2399997761098684, + "language_loss": 0.74940038, + "learning_rate": 3.98360441205484e-06, + "loss": 0.76847905, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 2.6028101444244385 + }, + { + "auxiliary_loss_clip": 0.01155411, + "auxiliary_loss_mlp": 0.01177146, + "balance_loss_clip": 1.00221395, + "balance_loss_mlp": 1.00090671, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6612954803426851, + "language_loss": 0.71583188, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73915744, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.6450960636138916 + }, + { + "auxiliary_loss_clip": 0.01188633, + "auxiliary_loss_mlp": 0.01177101, + "balance_loss_clip": 1.00254476, + "balance_loss_mlp": 1.00095749, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 2.0910700331682825, + "language_loss": 0.80034673, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82400405, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 3.9712588787078857 + }, + { + "auxiliary_loss_clip": 0.01188838, + "auxiliary_loss_mlp": 0.01177302, + "balance_loss_clip": 1.00281501, + "balance_loss_mlp": 1.00096726, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 2.59623044835953, + "language_loss": 0.81147885, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83514023, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 3.9919233322143555 + }, + { + "auxiliary_loss_clip": 0.01174485, + "auxiliary_loss_mlp": 0.0117737, + "balance_loss_clip": 1.00257969, + "balance_loss_mlp": 1.00103569, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 2.0290267086457856, + "language_loss": 0.76280564, + "learning_rate": 3.983404744675437e-06, + "loss": 0.7863242, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 4.059550046920776 + }, + { + "auxiliary_loss_clip": 0.01155006, + "auxiliary_loss_mlp": 0.01177432, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00119317, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 3.2114833864926173, + "language_loss": 0.82547987, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.84880418, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 4.063161373138428 + }, + { + "auxiliary_loss_clip": 0.01172413, + "auxiliary_loss_mlp": 0.01177328, + "balance_loss_clip": 1.00244498, + "balance_loss_mlp": 1.00108874, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 2.1745568515088562, + "language_loss": 0.79622734, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81972468, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.6006271839141846 + }, + { + "auxiliary_loss_clip": 0.01172025, + "auxiliary_loss_mlp": 0.01177456, + "balance_loss_clip": 1.00246, + "balance_loss_mlp": 1.00102592, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.9602686863558394, + "language_loss": 0.79079735, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81429207, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.5696215629577637 + }, + { + "auxiliary_loss_clip": 0.01122733, + "auxiliary_loss_mlp": 0.01177523, + "balance_loss_clip": 1.00210035, + "balance_loss_mlp": 1.00128424, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.8352224062977085, + "language_loss": 0.73021019, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75321269, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.6728553771972656 + }, + { + "auxiliary_loss_clip": 0.01155853, + "auxiliary_loss_mlp": 0.01177027, + "balance_loss_clip": 1.00230944, + "balance_loss_mlp": 1.00088358, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 2.1467324466108475, + "language_loss": 0.80989027, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83321905, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.651548147201538 + }, + { + "auxiliary_loss_clip": 0.01155852, + "auxiliary_loss_mlp": 0.01177026, + "balance_loss_clip": 1.00224662, + "balance_loss_mlp": 1.00078714, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.447971043357062, + "language_loss": 0.84626442, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86959314, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.6158220767974854 + }, + { + "auxiliary_loss_clip": 0.01171956, + "auxiliary_loss_mlp": 0.01177403, + "balance_loss_clip": 1.00231647, + "balance_loss_mlp": 1.00106835, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.3229106795481416, + "language_loss": 0.90189111, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9253847, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.544438123703003 + }, + { + "auxiliary_loss_clip": 0.01155642, + "auxiliary_loss_mlp": 0.01177586, + "balance_loss_clip": 1.00233507, + "balance_loss_mlp": 1.00106061, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.3374419258022936, + "language_loss": 0.88637674, + "learning_rate": 3.983001799915153e-06, + "loss": 0.90970898, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.6106173992156982 + }, + { + "auxiliary_loss_clip": 0.01188734, + "auxiliary_loss_mlp": 0.01177306, + "balance_loss_clip": 1.00260115, + "balance_loss_mlp": 1.00106668, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.2049227789776564, + "language_loss": 0.84123087, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86489129, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.5452663898468018 + }, + { + "auxiliary_loss_clip": 0.01157725, + "auxiliary_loss_mlp": 0.00749796, + "balance_loss_clip": 1.00235367, + "balance_loss_mlp": 1.00103092, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 2.3512040893812642, + "language_loss": 0.75648361, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77555883, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.577923536300659 + }, + { + "auxiliary_loss_clip": 0.01155621, + "auxiliary_loss_mlp": 0.01177336, + "balance_loss_clip": 1.00216925, + "balance_loss_mlp": 1.00090694, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 2.047394526538312, + "language_loss": 0.8942157, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91754532, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.6201181411743164 + }, + { + "auxiliary_loss_clip": 0.0115588, + "auxiliary_loss_mlp": 0.01177467, + "balance_loss_clip": 1.00228882, + "balance_loss_mlp": 1.00103724, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.9184235759566584, + "language_loss": 0.82125115, + "learning_rate": 3.982798522778748e-06, + "loss": 0.8445847, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 2.6293914318084717 + }, + { + "auxiliary_loss_clip": 0.0117259, + "auxiliary_loss_mlp": 0.01177334, + "balance_loss_clip": 1.00252223, + "balance_loss_mlp": 1.00109553, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.885509958148775, + "language_loss": 0.82356918, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84706843, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.515789747238159 + }, + { + "auxiliary_loss_clip": 0.01155507, + "auxiliary_loss_mlp": 0.01177182, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00094318, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.9947566607893876, + "language_loss": 0.85189205, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87521893, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 2.711554765701294 + }, + { + "auxiliary_loss_clip": 0.01172111, + "auxiliary_loss_mlp": 0.01177941, + "balance_loss_clip": 1.00253534, + "balance_loss_mlp": 1.00132084, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 2.1419377899485554, + "language_loss": 0.83249128, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85599178, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.58365797996521 + }, + { + "auxiliary_loss_clip": 0.01139454, + "auxiliary_loss_mlp": 0.01177416, + "balance_loss_clip": 1.00209284, + "balance_loss_mlp": 1.00108147, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 3.0008087330658086, + "language_loss": 0.74392831, + "learning_rate": 3.982594042635701e-06, + "loss": 0.767097, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.6164286136627197 + }, + { + "auxiliary_loss_clip": 0.01155644, + "auxiliary_loss_mlp": 0.01177422, + "balance_loss_clip": 1.00231981, + "balance_loss_mlp": 1.00118291, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.1550613933639413, + "language_loss": 0.86217403, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88550472, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.588996171951294 + }, + { + "auxiliary_loss_clip": 0.01156618, + "auxiliary_loss_mlp": 0.01176607, + "balance_loss_clip": 1.00344348, + "balance_loss_mlp": 1.00189435, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8992446427559608, + "language_loss": 0.63249546, + "learning_rate": 3.982491351475427e-06, + "loss": 0.6558277, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.2743172645568848 + }, + { + "auxiliary_loss_clip": 0.01172372, + "auxiliary_loss_mlp": 0.01177333, + "balance_loss_clip": 1.00246644, + "balance_loss_mlp": 1.0010941, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.7230991122014805, + "language_loss": 0.83621293, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85970998, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.5508873462677 + }, + { + "auxiliary_loss_clip": 0.01155797, + "auxiliary_loss_mlp": 0.01177472, + "balance_loss_clip": 1.00225806, + "balance_loss_mlp": 1.00113785, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.319211779281094, + "language_loss": 0.88437819, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90771085, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.6051578521728516 + }, + { + "auxiliary_loss_clip": 0.0115614, + "auxiliary_loss_mlp": 0.01177392, + "balance_loss_clip": 1.00267291, + "balance_loss_mlp": 1.0012486, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2719229312880405, + "language_loss": 0.83880001, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.86213541, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.819977283477783 + }, + { + "auxiliary_loss_clip": 0.01172181, + "auxiliary_loss_mlp": 0.0117713, + "balance_loss_clip": 1.00254798, + "balance_loss_mlp": 1.00098681, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 3.318906993468535, + "language_loss": 0.80084956, + "learning_rate": 3.982285067055262e-06, + "loss": 0.82434267, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.568509578704834 + }, + { + "auxiliary_loss_clip": 0.01188525, + "auxiliary_loss_mlp": 0.01177523, + "balance_loss_clip": 1.00239086, + "balance_loss_mlp": 1.00090218, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.46901223544317, + "language_loss": 0.79849756, + "learning_rate": 3.982233308024204e-06, + "loss": 0.82215798, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.5925066471099854 + }, + { + "auxiliary_loss_clip": 0.0112193, + "auxiliary_loss_mlp": 0.01177314, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00117052, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.099098578853854, + "language_loss": 0.77134371, + "learning_rate": 3.98218147382666e-06, + "loss": 0.7943362, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.6483614444732666 + }, + { + "auxiliary_loss_clip": 0.01188666, + "auxiliary_loss_mlp": 0.01177425, + "balance_loss_clip": 1.00253034, + "balance_loss_mlp": 1.00128114, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.694429339736079, + "language_loss": 0.65773642, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68139744, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.4891393184661865 + }, + { + "auxiliary_loss_clip": 0.01171695, + "auxiliary_loss_mlp": 0.01177097, + "balance_loss_clip": 1.00230646, + "balance_loss_mlp": 1.0009532, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 1.9912785773552941, + "language_loss": 0.69639277, + "learning_rate": 3.98207757993998e-06, + "loss": 0.7198807, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.5858001708984375 + }, + { + "auxiliary_loss_clip": 0.01139257, + "auxiliary_loss_mlp": 0.01177222, + "balance_loss_clip": 1.00221491, + "balance_loss_mlp": 1.00126874, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 4.705666653966287, + "language_loss": 0.78955042, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81271523, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.5974323749542236 + }, + { + "auxiliary_loss_clip": 0.01188529, + "auxiliary_loss_mlp": 0.01177275, + "balance_loss_clip": 1.00255752, + "balance_loss_mlp": 1.00103617, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0913746946899576, + "language_loss": 0.850088, + "learning_rate": 3.981973385410981e-06, + "loss": 0.8737461, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.5156033039093018 + }, + { + "auxiliary_loss_clip": 0.01139472, + "auxiliary_loss_mlp": 0.00749825, + "balance_loss_clip": 1.00230467, + "balance_loss_mlp": 1.00108981, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 3.1834886497818133, + "language_loss": 0.76671147, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78560442, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.01188604, + "auxiliary_loss_mlp": 0.01177652, + "balance_loss_clip": 1.00251257, + "balance_loss_mlp": 1.00141335, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3632525319467907, + "language_loss": 0.75765365, + "learning_rate": 3.981868890255468e-06, + "loss": 0.78131628, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.502004623413086 + }, + { + "auxiliary_loss_clip": 0.01140218, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_clip": 1.00229025, + "balance_loss_mlp": 1.00099564, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.9730581840921, + "language_loss": 0.74617761, + "learning_rate": 3.981816529947719e-06, + "loss": 0.76935315, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.5899274349212646 + }, + { + "auxiliary_loss_clip": 0.01188413, + "auxiliary_loss_mlp": 0.01176927, + "balance_loss_clip": 1.00240254, + "balance_loss_mlp": 1.00078368, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.843177425479984, + "language_loss": 0.78403866, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80769205, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.509742259979248 + }, + { + "auxiliary_loss_clip": 0.01155268, + "auxiliary_loss_mlp": 0.01177086, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.00084686, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9924838391579776, + "language_loss": 0.85754931, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88087279, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.01172421, + "auxiliary_loss_mlp": 0.01177277, + "balance_loss_clip": 1.00244498, + "balance_loss_mlp": 1.0013243, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 2.354574596787924, + "language_loss": 0.81661797, + "learning_rate": 3.981658998128341e-06, + "loss": 0.84011495, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 2.5676286220550537 + }, + { + "auxiliary_loss_clip": 0.01156172, + "auxiliary_loss_mlp": 0.01176938, + "balance_loss_clip": 1.00242555, + "balance_loss_mlp": 1.00117588, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 2.4044570169318473, + "language_loss": 0.80181092, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82514203, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.5901434421539307 + }, + { + "auxiliary_loss_clip": 0.01139633, + "auxiliary_loss_mlp": 0.00749803, + "balance_loss_clip": 1.00221598, + "balance_loss_mlp": 1.00099039, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.589518856266038, + "language_loss": 0.71023941, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.72913384, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 4.171745777130127 + }, + { + "auxiliary_loss_clip": 0.01122258, + "auxiliary_loss_mlp": 0.01177, + "balance_loss_clip": 1.0019573, + "balance_loss_mlp": 1.00104666, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.5780370054523467, + "language_loss": 0.86446857, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88746119, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 5.5384721755981445 + }, + { + "auxiliary_loss_clip": 0.0112263, + "auxiliary_loss_mlp": 0.0117733, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00109053, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.6703625419134647, + "language_loss": 0.84009612, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8630957, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 2.6662943363189697 + }, + { + "auxiliary_loss_clip": 0.0118861, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_clip": 1.00259709, + "balance_loss_mlp": 1.00092053, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.258602299056888, + "language_loss": 0.76674801, + "learning_rate": 3.981394942228581e-06, + "loss": 0.79040468, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 3.9550437927246094 + }, + { + "auxiliary_loss_clip": 0.01172107, + "auxiliary_loss_mlp": 0.01177623, + "balance_loss_clip": 1.0026511, + "balance_loss_mlp": 1.00167012, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 2.0127110898319716, + "language_loss": 0.82540989, + "learning_rate": 3.98134190563652e-06, + "loss": 0.84890723, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.5877201557159424 + }, + { + "auxiliary_loss_clip": 0.01171915, + "auxiliary_loss_mlp": 0.01177168, + "balance_loss_clip": 1.00239229, + "balance_loss_mlp": 1.00131035, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 3.2613046097779903, + "language_loss": 0.69102335, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7145142, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.53309965133667 + }, + { + "auxiliary_loss_clip": 0.01155079, + "auxiliary_loss_mlp": 0.00749752, + "balance_loss_clip": 1.00221014, + "balance_loss_mlp": 1.00102639, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 2.153150409357856, + "language_loss": 0.87963963, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89868796, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.5699191093444824 + }, + { + "auxiliary_loss_clip": 0.0112248, + "auxiliary_loss_mlp": 0.01177004, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.00114632, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 3.1725370718439847, + "language_loss": 0.78571355, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80870837, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.651120662689209 + }, + { + "auxiliary_loss_clip": 0.01171499, + "auxiliary_loss_mlp": 0.01176939, + "balance_loss_clip": 1.00228882, + "balance_loss_mlp": 1.00117683, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.6130194699767728, + "language_loss": 0.82392424, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84740865, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.630723714828491 + }, + { + "auxiliary_loss_clip": 0.0115551, + "auxiliary_loss_mlp": 0.00749722, + "balance_loss_clip": 1.00257015, + "balance_loss_mlp": 1.00098133, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 1.7987118212037796, + "language_loss": 0.76626503, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78531736, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.6418941020965576 + }, + { + "auxiliary_loss_clip": 0.0117194, + "auxiliary_loss_mlp": 0.01176895, + "balance_loss_clip": 1.00256634, + "balance_loss_mlp": 1.00113237, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 2.2228496267875633, + "language_loss": 0.77434236, + "learning_rate": 3.981022108368387e-06, + "loss": 0.7978307, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.5466668605804443 + }, + { + "auxiliary_loss_clip": 0.0117235, + "auxiliary_loss_mlp": 0.01176719, + "balance_loss_clip": 1.00234866, + "balance_loss_mlp": 1.00105166, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 1.8935877651467865, + "language_loss": 0.79709041, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.82058108, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.588144063949585 + }, + { + "auxiliary_loss_clip": 0.01172238, + "auxiliary_loss_mlp": 0.01176741, + "balance_loss_clip": 1.00232244, + "balance_loss_mlp": 1.00107431, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 2.1106572720532237, + "language_loss": 0.79002082, + "learning_rate": 3.980914908292955e-06, + "loss": 0.81351066, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.565746545791626 + }, + { + "auxiliary_loss_clip": 0.01171806, + "auxiliary_loss_mlp": 0.01176742, + "balance_loss_clip": 1.00238228, + "balance_loss_mlp": 1.00107527, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.4237186312805408, + "language_loss": 0.81053817, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83402371, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.587597370147705 + }, + { + "auxiliary_loss_clip": 0.01155113, + "auxiliary_loss_mlp": 0.01176685, + "balance_loss_clip": 1.00233686, + "balance_loss_mlp": 1.00120866, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 1.808703865019664, + "language_loss": 0.84384871, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86716676, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.6306283473968506 + }, + { + "auxiliary_loss_clip": 0.01156058, + "auxiliary_loss_mlp": 0.01176612, + "balance_loss_clip": 1.00234795, + "balance_loss_mlp": 1.00094473, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 3.029216942902623, + "language_loss": 0.90583897, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9291656, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.5397214889526367 + }, + { + "auxiliary_loss_clip": 0.01188258, + "auxiliary_loss_mlp": 0.01176761, + "balance_loss_clip": 1.00248802, + "balance_loss_mlp": 1.0010941, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 6.125489450546932, + "language_loss": 0.72758746, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.75123763, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.510134696960449 + }, + { + "auxiliary_loss_clip": 0.01122031, + "auxiliary_loss_mlp": 0.01176091, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00080538, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 2.0986483355866277, + "language_loss": 0.84574044, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86872166, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.6896684169769287 + }, + { + "auxiliary_loss_clip": 0.0118802, + "auxiliary_loss_mlp": 0.01176656, + "balance_loss_clip": 1.00229323, + "balance_loss_mlp": 1.00108469, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.66070568035988, + "language_loss": 0.845613, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86925977, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.551290512084961 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.0117666, + "balance_loss_clip": 1.00235856, + "balance_loss_mlp": 1.00118399, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 4.594949167398026, + "language_loss": 0.81709731, + "learning_rate": 3.980537341966595e-06, + "loss": 0.84028596, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.7457704544067383 + }, + { + "auxiliary_loss_clip": 0.01156183, + "auxiliary_loss_mlp": 0.01176734, + "balance_loss_clip": 1.00257373, + "balance_loss_mlp": 1.00116277, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.56553535882899, + "language_loss": 0.76342022, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78674942, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.6491127014160156 + }, + { + "auxiliary_loss_clip": 0.01138706, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_clip": 1.00215268, + "balance_loss_mlp": 1.00110054, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.650559151230144, + "language_loss": 0.86828732, + "learning_rate": 3.98042878992303e-06, + "loss": 0.89144206, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.614664316177368 + }, + { + "auxiliary_loss_clip": 0.01171543, + "auxiliary_loss_mlp": 0.01176825, + "balance_loss_clip": 1.00227809, + "balance_loss_mlp": 1.00125384, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 2.9000527725835696, + "language_loss": 0.87056816, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.568547487258911 + }, + { + "auxiliary_loss_clip": 0.01187952, + "auxiliary_loss_mlp": 0.01176379, + "balance_loss_clip": 1.0023334, + "balance_loss_mlp": 1.00109375, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.0936255789115403, + "language_loss": 0.84609401, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86973739, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 2.5125458240509033 + }, + { + "auxiliary_loss_clip": 0.01139745, + "auxiliary_loss_mlp": 0.01176765, + "balance_loss_clip": 1.00211585, + "balance_loss_mlp": 1.00119364, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 4.123489839451218, + "language_loss": 0.77743077, + "learning_rate": 3.98026539862741e-06, + "loss": 0.80059588, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.6192421913146973 + }, + { + "auxiliary_loss_clip": 0.01122341, + "auxiliary_loss_mlp": 0.01176531, + "balance_loss_clip": 1.00216126, + "balance_loss_mlp": 1.00115061, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 2.6459512852627367, + "language_loss": 0.92489076, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94787949, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.6742382049560547 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.01176593, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00102162, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 3.4769431889960813, + "language_loss": 0.91188347, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9347086, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.8111209869384766 + }, + { + "auxiliary_loss_clip": 0.01188197, + "auxiliary_loss_mlp": 0.01176654, + "balance_loss_clip": 1.00257087, + "balance_loss_mlp": 1.00127292, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.028433824662708, + "language_loss": 0.82278669, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84643519, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.5556445121765137 + }, + { + "auxiliary_loss_clip": 0.01187995, + "auxiliary_loss_mlp": 0.01176233, + "balance_loss_clip": 1.00231314, + "balance_loss_mlp": 1.00094771, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.526544640915614, + "language_loss": 0.83420384, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8578462, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.5195651054382324 + }, + { + "auxiliary_loss_clip": 0.01139096, + "auxiliary_loss_mlp": 0.01176674, + "balance_loss_clip": 1.00217819, + "balance_loss_mlp": 1.00100756, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.565326874636449, + "language_loss": 0.90341049, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92656815, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.6111912727355957 + }, + { + "auxiliary_loss_clip": 0.01188114, + "auxiliary_loss_mlp": 0.01176689, + "balance_loss_clip": 1.00235939, + "balance_loss_mlp": 1.00092721, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 6.211713142770978, + "language_loss": 0.76356834, + "learning_rate": 3.97993658861193e-06, + "loss": 0.78721642, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.475266218185425 + }, + { + "auxiliary_loss_clip": 0.01171413, + "auxiliary_loss_mlp": 0.01175962, + "balance_loss_clip": 1.00229251, + "balance_loss_mlp": 1.00086713, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.74214578677315, + "language_loss": 0.85752183, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.88099563, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 2.606651782989502 + }, + { + "auxiliary_loss_clip": 0.0117153, + "auxiliary_loss_mlp": 0.01176449, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00097251, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.452567262239367, + "language_loss": 0.79193383, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81541359, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.5566580295562744 + }, + { + "auxiliary_loss_clip": 0.01171699, + "auxiliary_loss_mlp": 0.00749617, + "balance_loss_clip": 1.00239241, + "balance_loss_mlp": 1.00083137, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.2910820940835896, + "language_loss": 0.7831865, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80239964, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.5879714488983154 + }, + { + "auxiliary_loss_clip": 0.01188129, + "auxiliary_loss_mlp": 0.01176288, + "balance_loss_clip": 1.00248551, + "balance_loss_mlp": 1.00090766, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.7459938124290673, + "language_loss": 0.81721085, + "learning_rate": 3.979715880319372e-06, + "loss": 0.840855, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.504448652267456 + }, + { + "auxiliary_loss_clip": 0.01157419, + "auxiliary_loss_mlp": 0.01176741, + "balance_loss_clip": 1.00216138, + "balance_loss_mlp": 1.00136018, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.2394995664084774, + "language_loss": 0.95212889, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97547054, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 2.5987155437469482 + }, + { + "auxiliary_loss_clip": 0.01171352, + "auxiliary_loss_mlp": 0.01176526, + "balance_loss_clip": 1.00230312, + "balance_loss_mlp": 1.00143147, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 1.9822574998066982, + "language_loss": 0.81043231, + "learning_rate": 3.979605075738569e-06, + "loss": 0.83391112, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.608098268508911 + }, + { + "auxiliary_loss_clip": 0.01188093, + "auxiliary_loss_mlp": 0.01176534, + "balance_loss_clip": 1.00239694, + "balance_loss_mlp": 1.0009625, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.4673812483600113, + "language_loss": 0.7042731, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72791934, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 2.657914876937866 + }, + { + "auxiliary_loss_clip": 0.0115576, + "auxiliary_loss_mlp": 0.01176312, + "balance_loss_clip": 1.00234306, + "balance_loss_mlp": 1.00093102, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1275839190958115, + "language_loss": 0.77088648, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79420722, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 5.514588356018066 + }, + { + "auxiliary_loss_clip": 0.01187834, + "auxiliary_loss_mlp": 0.01176435, + "balance_loss_clip": 1.00232172, + "balance_loss_mlp": 1.00105417, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 3.347758059679695, + "language_loss": 0.8308621, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85450476, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 3.9845094680786133 + }, + { + "auxiliary_loss_clip": 0.01139353, + "auxiliary_loss_mlp": 0.00749595, + "balance_loss_clip": 1.00223947, + "balance_loss_mlp": 1.00072491, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 3.401901108894725, + "language_loss": 0.75957268, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77846217, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 2.6777663230895996 + }, + { + "auxiliary_loss_clip": 0.01121808, + "auxiliary_loss_mlp": 0.00749654, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00081289, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.7559426939003313, + "language_loss": 0.77689207, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79560673, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 4.128470420837402 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01176813, + "balance_loss_clip": 1.00233078, + "balance_loss_mlp": 1.00124192, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.2245892671194025, + "language_loss": 0.86478615, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88810259, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.602581739425659 + }, + { + "auxiliary_loss_clip": 0.01138985, + "auxiliary_loss_mlp": 0.01176253, + "balance_loss_clip": 1.00222278, + "balance_loss_mlp": 1.00077665, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.483837141245245, + "language_loss": 0.89222687, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91537929, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.6434872150421143 + }, + { + "auxiliary_loss_clip": 0.01155456, + "auxiliary_loss_mlp": 0.01176765, + "balance_loss_clip": 1.00265384, + "balance_loss_mlp": 1.00119352, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.01285712088292, + "language_loss": 0.89026999, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91359222, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.612433910369873 + }, + { + "auxiliary_loss_clip": 0.01154355, + "auxiliary_loss_mlp": 0.01176154, + "balance_loss_clip": 1.00253868, + "balance_loss_mlp": 1.00220394, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9025540876104727, + "language_loss": 0.63070625, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65401137, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.2026169300079346 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01176838, + "balance_loss_clip": 1.00229979, + "balance_loss_mlp": 1.00117135, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 3.475231581167617, + "language_loss": 0.63073331, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65389395, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.67038893699646 + }, + { + "auxiliary_loss_clip": 0.01171687, + "auxiliary_loss_mlp": 0.01176173, + "balance_loss_clip": 1.0023911, + "balance_loss_mlp": 1.0011735, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 5.1052528813665985, + "language_loss": 0.76610196, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78958058, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.591543436050415 + }, + { + "auxiliary_loss_clip": 0.01154854, + "auxiliary_loss_mlp": 0.00749593, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00069475, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 3.303513055925117, + "language_loss": 0.69514292, + "learning_rate": 3.978933943232123e-06, + "loss": 0.71418738, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.5446999073028564 + }, + { + "auxiliary_loss_clip": 0.0118793, + "auxiliary_loss_mlp": 0.01176221, + "balance_loss_clip": 1.0023303, + "balance_loss_mlp": 1.00093544, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9021864468121417, + "language_loss": 0.88502777, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90866923, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.5372684001922607 + }, + { + "auxiliary_loss_clip": 0.01188053, + "auxiliary_loss_mlp": 0.0117704, + "balance_loss_clip": 1.00230896, + "balance_loss_mlp": 1.00137353, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 3.1035793890236074, + "language_loss": 0.88052762, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.9041785, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.467134714126587 + }, + { + "auxiliary_loss_clip": 0.01155069, + "auxiliary_loss_mlp": 0.01176699, + "balance_loss_clip": 1.00222659, + "balance_loss_mlp": 1.00141382, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 5.065910088239258, + "language_loss": 0.65794575, + "learning_rate": 3.978764471530921e-06, + "loss": 0.68126339, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.557111978530884 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.00749557, + "balance_loss_clip": 1.00233293, + "balance_loss_mlp": 1.00076306, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.915863509952987, + "language_loss": 0.74364161, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76285553, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.498732566833496 + }, + { + "auxiliary_loss_clip": 0.01155442, + "auxiliary_loss_mlp": 0.01176644, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.00126386, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 3.3609727662240703, + "language_loss": 0.81756639, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84088719, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.59419584274292 + }, + { + "auxiliary_loss_clip": 0.01123324, + "auxiliary_loss_mlp": 0.01176462, + "balance_loss_clip": 1.0023427, + "balance_loss_mlp": 1.00127172, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.3894111270853964, + "language_loss": 0.67364252, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69664037, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.719740152359009 + }, + { + "auxiliary_loss_clip": 0.01139049, + "auxiliary_loss_mlp": 0.01175649, + "balance_loss_clip": 1.00268626, + "balance_loss_mlp": 1.00169873, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.9078526443650317, + "language_loss": 0.7033509, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72649789, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.2347304821014404 + }, + { + "auxiliary_loss_clip": 0.01187844, + "auxiliary_loss_mlp": 0.01176421, + "balance_loss_clip": 1.00222373, + "balance_loss_mlp": 1.00123143, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.7362010043850855, + "language_loss": 0.80239439, + "learning_rate": 3.97848051802535e-06, + "loss": 0.82603705, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.5123090744018555 + }, + { + "auxiliary_loss_clip": 0.01138805, + "auxiliary_loss_mlp": 0.01176342, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.0011518, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 4.167686325585894, + "language_loss": 0.93533611, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95848751, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.58648943901062 + }, + { + "auxiliary_loss_clip": 0.01155144, + "auxiliary_loss_mlp": 0.01176837, + "balance_loss_clip": 1.00239706, + "balance_loss_mlp": 1.0014565, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.962437166949926, + "language_loss": 0.88157964, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90489942, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.639441967010498 + }, + { + "auxiliary_loss_clip": 0.01187765, + "auxiliary_loss_mlp": 0.01176037, + "balance_loss_clip": 1.00225616, + "balance_loss_mlp": 1.00113344, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 2.7268132634960027, + "language_loss": 0.79598558, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81962359, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.480908155441284 + }, + { + "auxiliary_loss_clip": 0.01155913, + "auxiliary_loss_mlp": 0.01173669, + "balance_loss_clip": 1.00302231, + "balance_loss_mlp": 1.0004822, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7869912333762389, + "language_loss": 0.57959616, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60289204, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.232792854309082 + }, + { + "auxiliary_loss_clip": 0.01122236, + "auxiliary_loss_mlp": 0.01176717, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00133622, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.294824051394129, + "language_loss": 0.89821565, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92120516, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.658564805984497 + }, + { + "auxiliary_loss_clip": 0.01155054, + "auxiliary_loss_mlp": 0.01176016, + "balance_loss_clip": 1.0021677, + "balance_loss_mlp": 1.00101662, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 15.713857405197444, + "language_loss": 0.81460238, + "learning_rate": 3.978137298044741e-06, + "loss": 0.8379131, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.5556159019470215 + }, + { + "auxiliary_loss_clip": 0.01171755, + "auxiliary_loss_mlp": 0.01176328, + "balance_loss_clip": 1.00232172, + "balance_loss_mlp": 1.00132871, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.928097524508203, + "language_loss": 0.76137459, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78485548, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.566739797592163 + }, + { + "auxiliary_loss_clip": 0.01140996, + "auxiliary_loss_mlp": 0.01176516, + "balance_loss_clip": 1.00199866, + "balance_loss_mlp": 1.00132561, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.9360697609298603, + "language_loss": 0.85101449, + "learning_rate": 3.978022291272044e-06, + "loss": 0.87418962, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.599071502685547 + }, + { + "auxiliary_loss_clip": 0.01187762, + "auxiliary_loss_mlp": 0.01176546, + "balance_loss_clip": 1.00233996, + "balance_loss_mlp": 1.00154698, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 1.9147922185214883, + "language_loss": 0.82922828, + "learning_rate": 3.977964675374399e-06, + "loss": 0.85287136, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.5901706218719482 + }, + { + "auxiliary_loss_clip": 0.01187769, + "auxiliary_loss_mlp": 0.01176467, + "balance_loss_clip": 1.00228143, + "balance_loss_mlp": 1.00146747, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.9489893247342835, + "language_loss": 0.82933998, + "learning_rate": 3.977906984472136e-06, + "loss": 0.85298234, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.519212245941162 + }, + { + "auxiliary_loss_clip": 0.0112197, + "auxiliary_loss_mlp": 0.0117606, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00106084, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 5.461708770383505, + "language_loss": 0.7616213, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78460163, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.6702427864074707 + }, + { + "auxiliary_loss_clip": 0.01157328, + "auxiliary_loss_mlp": 0.01176567, + "balance_loss_clip": 1.00230312, + "balance_loss_mlp": 1.00137699, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.078456303923348, + "language_loss": 0.80744618, + "learning_rate": 3.977791377662507e-06, + "loss": 0.8307851, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.5498735904693604 + }, + { + "auxiliary_loss_clip": 0.01139004, + "auxiliary_loss_mlp": 0.01176552, + "balance_loss_clip": 1.00206494, + "balance_loss_mlp": 1.00145757, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.8407052987466974, + "language_loss": 0.65896165, + "learning_rate": 3.977733461759524e-06, + "loss": 0.68211722, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.6391477584838867 + }, + { + "auxiliary_loss_clip": 0.01137881, + "auxiliary_loss_mlp": 0.01176207, + "balance_loss_clip": 1.00191331, + "balance_loss_mlp": 1.00120735, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2863027881163553, + "language_loss": 0.79401714, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81715798, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.6147196292877197 + }, + { + "auxiliary_loss_clip": 0.01154809, + "auxiliary_loss_mlp": 0.01176307, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.00130773, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.7632107649030244, + "language_loss": 0.72926676, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75257796, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.5521578788757324 + }, + { + "auxiliary_loss_clip": 0.01170724, + "auxiliary_loss_mlp": 0.01176344, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00163102, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 3.3991072133533806, + "language_loss": 0.82345533, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84692597, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 2.5294110774993896 + }, + { + "auxiliary_loss_clip": 0.0117111, + "auxiliary_loss_mlp": 0.01176701, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00160682, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.7416099311944437, + "language_loss": 0.88949621, + "learning_rate": 3.977501048211088e-06, + "loss": 0.91297436, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 2.5574166774749756 + }, + { + "auxiliary_loss_clip": 0.01171195, + "auxiliary_loss_mlp": 0.01176671, + "balance_loss_clip": 1.00223935, + "balance_loss_mlp": 1.00138593, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.05215268518274, + "language_loss": 0.71616113, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73963988, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.585361957550049 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.0117666, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00175643, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.764373839021364, + "language_loss": 0.82788002, + "learning_rate": 3.977384391505823e-06, + "loss": 0.8510232, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 4.192650318145752 + }, + { + "auxiliary_loss_clip": 0.01155182, + "auxiliary_loss_mlp": 0.00749553, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00057554, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 1.9758603170396896, + "language_loss": 0.80053431, + "learning_rate": 3.977325950678162e-06, + "loss": 0.81958163, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 4.025975942611694 + }, + { + "auxiliary_loss_clip": 0.01154525, + "auxiliary_loss_mlp": 0.01176592, + "balance_loss_clip": 1.00210559, + "balance_loss_mlp": 1.00159311, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 4.12890255055033, + "language_loss": 0.81393045, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83724159, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 2.590364933013916 + }, + { + "auxiliary_loss_clip": 0.01154505, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_clip": 1.00215995, + "balance_loss_mlp": 1.00157869, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 2.027703909782286, + "language_loss": 0.73114479, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75445461, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 5.515759706497192 + }, + { + "auxiliary_loss_clip": 0.01187763, + "auxiliary_loss_mlp": 0.01176488, + "balance_loss_clip": 1.00226474, + "balance_loss_mlp": 1.00158453, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.14829007457079, + "language_loss": 0.79990613, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.82354861, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.4782066345214844 + }, + { + "auxiliary_loss_clip": 0.01173719, + "auxiliary_loss_mlp": 0.01176002, + "balance_loss_clip": 1.0023284, + "balance_loss_mlp": 1.00119376, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.2683093221056603, + "language_loss": 0.59585118, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61934841, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.5803232192993164 + }, + { + "auxiliary_loss_clip": 0.0117112, + "auxiliary_loss_mlp": 0.01176457, + "balance_loss_clip": 1.0022223, + "balance_loss_mlp": 1.00136256, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 3.2580150780242527, + "language_loss": 0.74677861, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77025437, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.5136656761169434 + }, + { + "auxiliary_loss_clip": 0.01155533, + "auxiliary_loss_mlp": 0.0117617, + "balance_loss_clip": 1.00237441, + "balance_loss_mlp": 1.00117087, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 2.2981658246288568, + "language_loss": 0.88556671, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90888369, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.5941200256347656 + }, + { + "auxiliary_loss_clip": 0.01155555, + "auxiliary_loss_mlp": 0.01175895, + "balance_loss_clip": 1.00221252, + "balance_loss_mlp": 1.00108647, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 2.6632794396203545, + "language_loss": 0.82838511, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85169959, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.570591688156128 + }, + { + "auxiliary_loss_clip": 0.01170967, + "auxiliary_loss_mlp": 0.01176084, + "balance_loss_clip": 1.00222468, + "balance_loss_mlp": 1.0012753, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 3.5439818948846207, + "language_loss": 0.75847071, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78194118, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.527092933654785 + }, + { + "auxiliary_loss_clip": 0.01137612, + "auxiliary_loss_mlp": 0.01176487, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00148797, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 2.507673217463609, + "language_loss": 0.75249231, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77563334, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.6693406105041504 + }, + { + "auxiliary_loss_clip": 0.01187698, + "auxiliary_loss_mlp": 0.01176353, + "balance_loss_clip": 1.0023551, + "balance_loss_mlp": 1.00154448, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0264693734337076, + "language_loss": 0.84258413, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86622465, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.5276803970336914 + }, + { + "auxiliary_loss_clip": 0.01171015, + "auxiliary_loss_mlp": 0.01176198, + "balance_loss_clip": 1.0022366, + "balance_loss_mlp": 1.00148511, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.1113438683409123, + "language_loss": 0.75175869, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77523082, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.549455404281616 + }, + { + "auxiliary_loss_clip": 0.01171816, + "auxiliary_loss_mlp": 0.01176216, + "balance_loss_clip": 1.00222254, + "balance_loss_mlp": 1.00150251, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 2.134196679946556, + "language_loss": 0.76262558, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78610587, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.7513694763183594 + }, + { + "auxiliary_loss_clip": 0.01187664, + "auxiliary_loss_mlp": 0.01176299, + "balance_loss_clip": 1.00237548, + "balance_loss_mlp": 1.00149012, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 3.253249272524959, + "language_loss": 0.84116822, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86480784, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.533146381378174 + }, + { + "auxiliary_loss_clip": 0.01154569, + "auxiliary_loss_mlp": 0.01176372, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00127733, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.4206385381921653, + "language_loss": 0.77029693, + "learning_rate": 3.97649990716259e-06, + "loss": 0.79360628, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.581409454345703 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01176055, + "balance_loss_clip": 1.00214088, + "balance_loss_mlp": 1.00124609, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 2.2055011001814377, + "language_loss": 0.84820223, + "learning_rate": 3.976440341863237e-06, + "loss": 0.87151098, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.6608355045318604 + }, + { + "auxiliary_loss_clip": 0.01187699, + "auxiliary_loss_mlp": 0.01176261, + "balance_loss_clip": 1.00227261, + "balance_loss_mlp": 1.00135756, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 3.1634294535203926, + "language_loss": 0.85765088, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88129044, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.481457471847534 + }, + { + "auxiliary_loss_clip": 0.01187687, + "auxiliary_loss_mlp": 0.01176033, + "balance_loss_clip": 1.0022819, + "balance_loss_mlp": 1.00103426, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.7057882662690875, + "language_loss": 0.85686517, + "learning_rate": 3.976320986426344e-06, + "loss": 0.88050234, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.536926031112671 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01176178, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00117826, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 3.776095044857639, + "language_loss": 0.91289592, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93620068, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.5637807846069336 + }, + { + "auxiliary_loss_clip": 0.01155671, + "auxiliary_loss_mlp": 0.0117314, + "balance_loss_clip": 1.00276613, + "balance_loss_mlp": 0.99995273, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8817292889719001, + "language_loss": 0.65038764, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67367578, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.264636278152466 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01176022, + "balance_loss_clip": 1.00223494, + "balance_loss_mlp": 1.00111842, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 7.2694182032534265, + "language_loss": 0.87802339, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.90149379, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.6242971420288086 + }, + { + "auxiliary_loss_clip": 0.01106327, + "auxiliary_loss_mlp": 0.01176576, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00157666, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.0955635552594716, + "language_loss": 0.85293818, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87576723, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.7402255535125732 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.0117613, + "balance_loss_clip": 1.00228691, + "balance_loss_mlp": 1.00103569, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.0490872885971982, + "language_loss": 0.79012817, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81328022, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.6061418056488037 + }, + { + "auxiliary_loss_clip": 0.0113825, + "auxiliary_loss_mlp": 0.0117619, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00109529, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.3113420494648325, + "language_loss": 0.87949276, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90263706, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.6335158348083496 + }, + { + "auxiliary_loss_clip": 0.01187665, + "auxiliary_loss_mlp": 0.01175885, + "balance_loss_clip": 1.00233269, + "balance_loss_mlp": 1.00126731, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.7885660821183644, + "language_loss": 0.96282834, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98646379, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.4978013038635254 + }, + { + "auxiliary_loss_clip": 0.01154967, + "auxiliary_loss_mlp": 0.0117599, + "balance_loss_clip": 1.00217175, + "balance_loss_mlp": 1.00127649, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.4666396216344624, + "language_loss": 0.76697016, + "learning_rate": 3.97584056716893e-06, + "loss": 0.79027975, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.6167185306549072 + }, + { + "auxiliary_loss_clip": 0.01122253, + "auxiliary_loss_mlp": 0.00749539, + "balance_loss_clip": 1.0021894, + "balance_loss_mlp": 1.0005548, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 2.4518669351368656, + "language_loss": 0.80840516, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82712305, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.6850175857543945 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01175997, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00118887, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 3.9854720718010666, + "language_loss": 0.86793786, + "learning_rate": 3.975719713068202e-06, + "loss": 0.89107418, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.684561014175415 + }, + { + "auxiliary_loss_clip": 0.01187604, + "auxiliary_loss_mlp": 0.01175897, + "balance_loss_clip": 1.00232327, + "balance_loss_mlp": 1.00099301, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.9244095533571577, + "language_loss": 0.71808779, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74172282, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.684378147125244 + }, + { + "auxiliary_loss_clip": 0.01171486, + "auxiliary_loss_mlp": 0.01176774, + "balance_loss_clip": 1.00226998, + "balance_loss_mlp": 1.00177503, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 2.7354870229339983, + "language_loss": 0.70888293, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73236555, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.720141887664795 + }, + { + "auxiliary_loss_clip": 0.01155628, + "auxiliary_loss_mlp": 0.00749566, + "balance_loss_clip": 1.0025636, + "balance_loss_mlp": 1.00055313, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.0623557042805154, + "language_loss": 0.82045758, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.83950949, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.596848249435425 + }, + { + "auxiliary_loss_clip": 0.01170976, + "auxiliary_loss_mlp": 0.01176013, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00139487, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 3.7722311733837346, + "language_loss": 0.7517429, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77521282, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.525405168533325 + }, + { + "auxiliary_loss_clip": 0.01187542, + "auxiliary_loss_mlp": 0.011762, + "balance_loss_clip": 1.00225866, + "balance_loss_mlp": 1.00158191, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 4.057170278869726, + "language_loss": 0.76418006, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78781748, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.500598669052124 + }, + { + "auxiliary_loss_clip": 0.01104305, + "auxiliary_loss_mlp": 0.01176423, + "balance_loss_clip": 1.00172162, + "balance_loss_mlp": 1.00151896, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.7918980859528262, + "language_loss": 0.85315406, + "learning_rate": 3.975355352771841e-06, + "loss": 0.8759613, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 2.711984872817993 + }, + { + "auxiliary_loss_clip": 0.01170887, + "auxiliary_loss_mlp": 0.01175605, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00108266, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 2.85122341280114, + "language_loss": 0.91051459, + "learning_rate": 3.975294363872468e-06, + "loss": 0.93397951, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 2.5553951263427734 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01175892, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00108349, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 2.290714620817098, + "language_loss": 0.83427757, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85725343, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.6552956104278564 + }, + { + "auxiliary_loss_clip": 0.01141503, + "auxiliary_loss_mlp": 0.01175703, + "balance_loss_clip": 1.00239146, + "balance_loss_mlp": 1.00127578, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.524850159718355, + "language_loss": 0.77608138, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79925346, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 2.63456654548645 + }, + { + "auxiliary_loss_clip": 0.01171176, + "auxiliary_loss_mlp": 0.01176183, + "balance_loss_clip": 1.00223637, + "balance_loss_mlp": 1.00127947, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.901208306366608, + "language_loss": 0.80246222, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82593578, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 3.9997267723083496 + }, + { + "auxiliary_loss_clip": 0.01154545, + "auxiliary_loss_mlp": 0.00749541, + "balance_loss_clip": 1.00235772, + "balance_loss_mlp": 1.00060678, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.6887662175970504, + "language_loss": 0.73318803, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75222886, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 3.9742324352264404 + }, + { + "auxiliary_loss_clip": 0.01171697, + "auxiliary_loss_mlp": 0.01177083, + "balance_loss_clip": 1.0024004, + "balance_loss_mlp": 1.00198877, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 2.5462220019304675, + "language_loss": 0.85785055, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88133836, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 3.984163522720337 + }, + { + "auxiliary_loss_clip": 0.01154758, + "auxiliary_loss_mlp": 0.01176302, + "balance_loss_clip": 1.0023005, + "balance_loss_mlp": 1.00158858, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.6573465386801172, + "language_loss": 0.82227921, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84558976, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.01171753, + "auxiliary_loss_mlp": 0.00749591, + "balance_loss_clip": 1.0023427, + "balance_loss_mlp": 1.00063312, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.496806539037224, + "language_loss": 0.73552722, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75474066, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 3.9268958568573 + }, + { + "auxiliary_loss_clip": 0.01140844, + "auxiliary_loss_mlp": 0.00749605, + "balance_loss_clip": 1.00210559, + "balance_loss_mlp": 1.00061917, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.520590360213625, + "language_loss": 0.80070603, + "learning_rate": 3.974803756351379e-06, + "loss": 0.8196106, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.6375372409820557 + }, + { + "auxiliary_loss_clip": 0.01171357, + "auxiliary_loss_mlp": 0.01176123, + "balance_loss_clip": 1.00223064, + "balance_loss_mlp": 1.00150514, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.7275359596328963, + "language_loss": 0.74047041, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76394522, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.6041204929351807 + }, + { + "auxiliary_loss_clip": 0.01138455, + "auxiliary_loss_mlp": 0.01176318, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.00150919, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.8808252975676436, + "language_loss": 0.65611613, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67926383, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.6095657348632812 + }, + { + "auxiliary_loss_clip": 0.01141046, + "auxiliary_loss_mlp": 0.01176442, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.0013473, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 6.084131737568309, + "language_loss": 0.73162711, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75480205, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.6590447425842285 + }, + { + "auxiliary_loss_clip": 0.01122424, + "auxiliary_loss_mlp": 0.0117612, + "balance_loss_clip": 1.00225151, + "balance_loss_mlp": 1.00121617, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.6997625633670321, + "language_loss": 0.90653193, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92951739, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.699101448059082 + }, + { + "auxiliary_loss_clip": 0.01171574, + "auxiliary_loss_mlp": 0.01176068, + "balance_loss_clip": 1.0022552, + "balance_loss_mlp": 1.00145042, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.122488963560341, + "language_loss": 0.80090952, + "learning_rate": 3.974494692820539e-06, + "loss": 0.824386, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.5570437908172607 + }, + { + "auxiliary_loss_clip": 0.01154643, + "auxiliary_loss_mlp": 0.01176019, + "balance_loss_clip": 1.00229621, + "balance_loss_mlp": 1.00121021, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.9120772214859056, + "language_loss": 0.69087726, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71418393, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.569448471069336 + }, + { + "auxiliary_loss_clip": 0.01170881, + "auxiliary_loss_mlp": 0.01176141, + "balance_loss_clip": 1.00226784, + "balance_loss_mlp": 1.00152338, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.1048057165391465, + "language_loss": 0.83632481, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.85979503, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.5192606449127197 + }, + { + "auxiliary_loss_clip": 0.01187289, + "auxiliary_loss_mlp": 0.01175825, + "balance_loss_clip": 1.00228167, + "balance_loss_mlp": 1.00120699, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 4.240206311925487, + "language_loss": 0.90563291, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92926407, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.5002856254577637 + }, + { + "auxiliary_loss_clip": 0.0113916, + "auxiliary_loss_mlp": 0.01175878, + "balance_loss_clip": 1.00229216, + "balance_loss_mlp": 1.00126028, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 2.248289966117286, + "language_loss": 0.82405454, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84720492, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.6362783908843994 + }, + { + "auxiliary_loss_clip": 0.01154273, + "auxiliary_loss_mlp": 0.0117566, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.00113726, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.8168802816294543, + "language_loss": 0.79139704, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81469643, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.608661651611328 + }, + { + "auxiliary_loss_clip": 0.01106588, + "auxiliary_loss_mlp": 0.00749558, + "balance_loss_clip": 1.0021565, + "balance_loss_mlp": 1.00058186, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.3648497716929726, + "language_loss": 0.88556993, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90413141, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.6952364444732666 + }, + { + "auxiliary_loss_clip": 0.01187184, + "auxiliary_loss_mlp": 0.01175725, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.00129771, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 3.2122237054078737, + "language_loss": 0.83153403, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8551631, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.5112786293029785 + }, + { + "auxiliary_loss_clip": 0.011874, + "auxiliary_loss_mlp": 0.01175941, + "balance_loss_clip": 1.00232339, + "balance_loss_mlp": 1.00113237, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 3.9264375005305028, + "language_loss": 0.78977871, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.81341213, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.468752861022949 + }, + { + "auxiliary_loss_clip": 0.01171161, + "auxiliary_loss_mlp": 0.0117597, + "balance_loss_clip": 1.00221753, + "balance_loss_mlp": 1.00116122, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.5541029629556347, + "language_loss": 0.74164104, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76511234, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.4968998432159424 + }, + { + "auxiliary_loss_clip": 0.01154377, + "auxiliary_loss_mlp": 0.01176013, + "balance_loss_clip": 1.00233412, + "balance_loss_mlp": 1.00139546, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.6476891185122196, + "language_loss": 0.81136715, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83467102, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.604459285736084 + }, + { + "auxiliary_loss_clip": 0.01187351, + "auxiliary_loss_mlp": 0.00749574, + "balance_loss_clip": 1.00233293, + "balance_loss_mlp": 1.00059342, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.761813466273557, + "language_loss": 0.88752222, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90689147, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.458976984024048 + }, + { + "auxiliary_loss_clip": 0.01171331, + "auxiliary_loss_mlp": 0.00749614, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00059199, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.2458698496142366, + "language_loss": 0.73286551, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75207496, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.7156999111175537 + }, + { + "auxiliary_loss_clip": 0.01156833, + "auxiliary_loss_mlp": 0.01175799, + "balance_loss_clip": 1.00228786, + "balance_loss_mlp": 1.00118113, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.8068490523590297, + "language_loss": 0.8282457, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85157204, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.56221342086792 + }, + { + "auxiliary_loss_clip": 0.01124772, + "auxiliary_loss_mlp": 0.01175525, + "balance_loss_clip": 1.0022223, + "balance_loss_mlp": 1.00119328, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.4684131645865683, + "language_loss": 0.75072545, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77372837, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.668992280960083 + }, + { + "auxiliary_loss_clip": 0.01154339, + "auxiliary_loss_mlp": 0.01175786, + "balance_loss_clip": 1.00209701, + "balance_loss_mlp": 1.00135863, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 1.900331327807726, + "language_loss": 0.80475569, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82805693, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.6066362857818604 + }, + { + "auxiliary_loss_clip": 0.0113756, + "auxiliary_loss_mlp": 0.01172684, + "balance_loss_clip": 1.00303245, + "balance_loss_mlp": 1.00025928, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.73728776386496, + "language_loss": 0.55986023, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58296263, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.239428758621216 + }, + { + "auxiliary_loss_clip": 0.01155153, + "auxiliary_loss_mlp": 0.0117574, + "balance_loss_clip": 1.00229096, + "balance_loss_mlp": 1.00131333, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.653138628102825, + "language_loss": 0.6796658, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.7029748, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.598376989364624 + }, + { + "auxiliary_loss_clip": 0.01154616, + "auxiliary_loss_mlp": 0.01175838, + "balance_loss_clip": 1.00224757, + "balance_loss_mlp": 1.0016017, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.769117140756105, + "language_loss": 0.87053573, + "learning_rate": 3.973366567512453e-06, + "loss": 0.89384025, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.650423765182495 + }, + { + "auxiliary_loss_clip": 0.01122557, + "auxiliary_loss_mlp": 0.01175782, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00116432, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.5269233133759497, + "language_loss": 0.87093449, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89391792, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.6284844875335693 + }, + { + "auxiliary_loss_clip": 0.01171016, + "auxiliary_loss_mlp": 0.01175776, + "balance_loss_clip": 1.00235343, + "balance_loss_mlp": 1.00154018, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 4.571497014816667, + "language_loss": 0.89404029, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91750824, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.529641628265381 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01172495, + "balance_loss_clip": 1.00308084, + "balance_loss_mlp": 1.00007105, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8867233574665884, + "language_loss": 0.64787757, + "learning_rate": 3.97317618909838e-06, + "loss": 0.67131197, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.0238475799560547 + }, + { + "auxiliary_loss_clip": 0.01171185, + "auxiliary_loss_mlp": 0.01175844, + "balance_loss_clip": 1.00217402, + "balance_loss_mlp": 1.00132132, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.859914392500039, + "language_loss": 0.89806676, + "learning_rate": 3.973112579977733e-06, + "loss": 0.92153704, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.5133867263793945 + }, + { + "auxiliary_loss_clip": 0.01154697, + "auxiliary_loss_mlp": 0.01175644, + "balance_loss_clip": 1.00232041, + "balance_loss_mlp": 1.00112152, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.24735572007719, + "language_loss": 0.77003831, + "learning_rate": 3.973048896036459e-06, + "loss": 0.79334176, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.5638253688812256 + }, + { + "auxiliary_loss_clip": 0.01157948, + "auxiliary_loss_mlp": 0.01172516, + "balance_loss_clip": 1.00337458, + "balance_loss_mlp": 1.00009203, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8073231615957642, + "language_loss": 0.57470202, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59800667, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.069920063018799 + }, + { + "auxiliary_loss_clip": 0.01139418, + "auxiliary_loss_mlp": 0.01176027, + "balance_loss_clip": 1.00227022, + "balance_loss_mlp": 1.00131369, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.7538903739139022, + "language_loss": 0.8720606, + "learning_rate": 3.972921303701695e-06, + "loss": 0.89521503, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 2.6136889457702637 + }, + { + "auxiliary_loss_clip": 0.01187255, + "auxiliary_loss_mlp": 0.01175596, + "balance_loss_clip": 1.00234127, + "balance_loss_mlp": 1.00107384, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 2.3241588393764303, + "language_loss": 0.87428558, + "learning_rate": 3.972857395313042e-06, + "loss": 0.89791411, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.5215487480163574 + }, + { + "auxiliary_loss_clip": 0.01170374, + "auxiliary_loss_mlp": 0.01175767, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00114894, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6453845885973104, + "language_loss": 0.92821866, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95168006, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 3.9937872886657715 + }, + { + "auxiliary_loss_clip": 0.01173302, + "auxiliary_loss_mlp": 0.01175953, + "balance_loss_clip": 1.00241089, + "balance_loss_mlp": 1.00171673, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 2.685934405176823, + "language_loss": 0.89436132, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91785383, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 3.9192354679107666 + }, + { + "auxiliary_loss_clip": 0.01123817, + "auxiliary_loss_mlp": 0.01175859, + "balance_loss_clip": 1.00238824, + "balance_loss_mlp": 1.00124145, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.5919551411298698, + "language_loss": 0.76662529, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78962207, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.657322645187378 + }, + { + "auxiliary_loss_clip": 0.01187208, + "auxiliary_loss_mlp": 0.01175813, + "balance_loss_clip": 1.00231051, + "balance_loss_mlp": 1.00148141, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.61956344481301, + "language_loss": 0.88629425, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90992451, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 3.9121031761169434 + }, + { + "auxiliary_loss_clip": 0.01140362, + "auxiliary_loss_mlp": 0.0074963, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00068295, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.0305086132318113, + "language_loss": 0.82518727, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84408712, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 4.061012506484985 + }, + { + "auxiliary_loss_clip": 0.01187162, + "auxiliary_loss_mlp": 0.01175958, + "balance_loss_clip": 1.00220406, + "balance_loss_mlp": 1.00124454, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 2.9788311817353725, + "language_loss": 0.75575542, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77938664, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 2.52419114112854 + }, + { + "auxiliary_loss_clip": 0.01170623, + "auxiliary_loss_mlp": 0.00749611, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00068831, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 2.2800005736996805, + "language_loss": 0.83191216, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85111445, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.571878671646118 + }, + { + "auxiliary_loss_clip": 0.01170921, + "auxiliary_loss_mlp": 0.01172478, + "balance_loss_clip": 1.0035125, + "balance_loss_mlp": 1.00005364, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8523309467096758, + "language_loss": 0.59726691, + "learning_rate": 3.972343435213775e-06, + "loss": 0.6207009, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.145779848098755 + }, + { + "auxiliary_loss_clip": 0.0113799, + "auxiliary_loss_mlp": 0.01176179, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00156164, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.8283383311020873, + "language_loss": 0.82959628, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85273796, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.6133031845092773 + }, + { + "auxiliary_loss_clip": 0.01173188, + "auxiliary_loss_mlp": 0.01175965, + "balance_loss_clip": 1.00231409, + "balance_loss_mlp": 1.0015384, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 9.836750857345061, + "language_loss": 0.7122947, + "learning_rate": 3.972214197225521e-06, + "loss": 0.7357862, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.5158443450927734 + }, + { + "auxiliary_loss_clip": 0.01170939, + "auxiliary_loss_mlp": 0.01175803, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00109017, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 1.9786210410579375, + "language_loss": 0.70349932, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72696674, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.5671122074127197 + }, + { + "auxiliary_loss_clip": 0.01170667, + "auxiliary_loss_mlp": 0.01175947, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00132966, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 3.3784796924066107, + "language_loss": 0.84818941, + "learning_rate": 3.97208466009103e-06, + "loss": 0.87165558, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.534675359725952 + }, + { + "auxiliary_loss_clip": 0.01154128, + "auxiliary_loss_mlp": 0.01175897, + "balance_loss_clip": 1.00210488, + "balance_loss_mlp": 1.00137448, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9960489397628736, + "language_loss": 1.02495563, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04825592, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.595313549041748 + }, + { + "auxiliary_loss_clip": 0.01104895, + "auxiliary_loss_mlp": 0.0117625, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00153649, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.4667510989020482, + "language_loss": 0.83480966, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85762119, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.7447686195373535 + }, + { + "auxiliary_loss_clip": 0.01187179, + "auxiliary_loss_mlp": 0.01176576, + "balance_loss_clip": 1.00220108, + "balance_loss_mlp": 1.00176716, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 49.603030801998905, + "language_loss": 0.72848028, + "learning_rate": 3.971889793533093e-06, + "loss": 0.75211787, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.4919002056121826 + }, + { + "auxiliary_loss_clip": 0.01153809, + "auxiliary_loss_mlp": 0.01175578, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00134194, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 4.113810038497482, + "language_loss": 0.77024829, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79354215, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.597641706466675 + }, + { + "auxiliary_loss_clip": 0.01187242, + "auxiliary_loss_mlp": 0.01175925, + "balance_loss_clip": 1.00230348, + "balance_loss_mlp": 1.00130713, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 3.184039774935825, + "language_loss": 0.72815067, + "learning_rate": 3.971759508619069e-06, + "loss": 0.75178236, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.4754703044891357 + }, + { + "auxiliary_loss_clip": 0.01187158, + "auxiliary_loss_mlp": 0.01175966, + "balance_loss_clip": 1.00227225, + "balance_loss_mlp": 1.00153947, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 2.050287632399122, + "language_loss": 0.77555245, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79918373, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.644068479537964 + }, + { + "auxiliary_loss_clip": 0.01124759, + "auxiliary_loss_mlp": 0.01175682, + "balance_loss_clip": 1.00233388, + "balance_loss_mlp": 1.00125456, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.007456545734724, + "language_loss": 0.82057744, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84358186, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.6473190784454346 + }, + { + "auxiliary_loss_clip": 0.01171043, + "auxiliary_loss_mlp": 0.01175848, + "balance_loss_clip": 1.00238597, + "balance_loss_mlp": 1.00151622, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 2.564908184641466, + "language_loss": 0.82084912, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84431803, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.555690288543701 + }, + { + "auxiliary_loss_clip": 0.01138334, + "auxiliary_loss_mlp": 0.0117612, + "balance_loss_clip": 1.00200808, + "balance_loss_mlp": 1.00169325, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 1.7988765927652106, + "language_loss": 0.81748545, + "learning_rate": 3.97149804157902e-06, + "loss": 0.84063005, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.5807418823242188 + }, + { + "auxiliary_loss_clip": 0.01187169, + "auxiliary_loss_mlp": 0.01175946, + "balance_loss_clip": 1.00231826, + "balance_loss_mlp": 1.0014236, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.388005672537933, + "language_loss": 0.84063876, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86426991, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.4718070030212402 + }, + { + "auxiliary_loss_clip": 0.01137638, + "auxiliary_loss_mlp": 0.01175298, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00125206, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.9426256673604825, + "language_loss": 0.81314117, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83627051, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.641726493835449 + }, + { + "auxiliary_loss_clip": 0.01121441, + "auxiliary_loss_mlp": 0.00749602, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00065875, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.200630021277952, + "language_loss": 0.75035286, + "learning_rate": 3.971301156316582e-06, + "loss": 0.76906329, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.7299273014068604 + }, + { + "auxiliary_loss_clip": 0.01120894, + "auxiliary_loss_mlp": 0.01176107, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.0013938, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6619052042565463, + "language_loss": 0.74474663, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76771659, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.652027130126953 + }, + { + "auxiliary_loss_clip": 0.01071676, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_clip": 1.00155616, + "balance_loss_mlp": 1.00134587, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.217382972286674, + "language_loss": 0.71308899, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73555958, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 2.9575512409210205 + }, + { + "auxiliary_loss_clip": 0.01141578, + "auxiliary_loss_mlp": 0.01175554, + "balance_loss_clip": 1.00246572, + "balance_loss_mlp": 1.00093603, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.643835808954914, + "language_loss": 0.88025004, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.9034214, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 3.1733245849609375 + }, + { + "auxiliary_loss_clip": 0.01138534, + "auxiliary_loss_mlp": 0.01175592, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00126076, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.8313941523436137, + "language_loss": 0.82380688, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84694815, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.666679859161377 + }, + { + "auxiliary_loss_clip": 0.01123192, + "auxiliary_loss_mlp": 0.01172051, + "balance_loss_clip": 1.00430238, + "balance_loss_mlp": 1.00038958, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8240273756628516, + "language_loss": 0.60689569, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62984812, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.254887819290161 + }, + { + "auxiliary_loss_clip": 0.01155278, + "auxiliary_loss_mlp": 0.01171616, + "balance_loss_clip": 1.00375199, + "balance_loss_mlp": 0.99995416, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9115541234265345, + "language_loss": 0.62186962, + "learning_rate": 3.970905367556871e-06, + "loss": 0.6451385, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.1016790866851807 + }, + { + "auxiliary_loss_clip": 0.01124064, + "auxiliary_loss_mlp": 0.01176099, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.00167215, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.7850964740578892, + "language_loss": 0.8283813, + "learning_rate": 3.970839141169718e-06, + "loss": 0.85138297, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.672132968902588 + }, + { + "auxiliary_loss_clip": 0.01154026, + "auxiliary_loss_mlp": 0.01175499, + "balance_loss_clip": 1.0020324, + "balance_loss_mlp": 1.00135756, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 2.471120060652478, + "language_loss": 0.84870625, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87200153, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.617551565170288 + }, + { + "auxiliary_loss_clip": 0.01170306, + "auxiliary_loss_mlp": 0.01175738, + "balance_loss_clip": 1.00202358, + "balance_loss_mlp": 1.00131083, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 4.999395763588875, + "language_loss": 0.87310421, + "learning_rate": 3.970706464194672e-06, + "loss": 0.8965646, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.5868518352508545 + }, + { + "auxiliary_loss_clip": 0.01140737, + "auxiliary_loss_mlp": 0.01175338, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00129259, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 5.031733612725446, + "language_loss": 0.78767645, + "learning_rate": 3.970640013611812e-06, + "loss": 0.81083715, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.7706198692321777 + }, + { + "auxiliary_loss_clip": 0.0117027, + "auxiliary_loss_mlp": 0.01175429, + "balance_loss_clip": 1.00214636, + "balance_loss_mlp": 1.00119305, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.5022086587771852, + "language_loss": 0.86440897, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88786596, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.521630048751831 + }, + { + "auxiliary_loss_clip": 0.01170986, + "auxiliary_loss_mlp": 0.00749647, + "balance_loss_clip": 1.00221229, + "balance_loss_mlp": 1.00069785, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 2.875695955811881, + "language_loss": 0.88522816, + "learning_rate": 3.970506888268011e-06, + "loss": 0.9044345, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.5249547958374023 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01175551, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00141001, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.158295365569896, + "language_loss": 0.77665377, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79978818, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.6081557273864746 + }, + { + "auxiliary_loss_clip": 0.01170587, + "auxiliary_loss_mlp": 0.01175841, + "balance_loss_clip": 1.00213754, + "balance_loss_mlp": 1.00150919, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 3.460892556574991, + "language_loss": 0.83097726, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85444152, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 3.9731597900390625 + }, + { + "auxiliary_loss_clip": 0.01122017, + "auxiliary_loss_mlp": 0.01175585, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00125349, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.8947255102376284, + "language_loss": 0.84998614, + "learning_rate": 3.970306639845e-06, + "loss": 0.87296218, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 4.1638195514678955 + }, + { + "auxiliary_loss_clip": 0.01137895, + "auxiliary_loss_mlp": 0.01175605, + "balance_loss_clip": 1.00201046, + "balance_loss_mlp": 1.001369, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 1.8093142977615615, + "language_loss": 0.68985236, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71298742, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 2.6084327697753906 + }, + { + "auxiliary_loss_clip": 0.01170748, + "auxiliary_loss_mlp": 0.01175478, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00124145, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7759566640262412, + "language_loss": 0.81901753, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84247977, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 3.9880082607269287 + }, + { + "auxiliary_loss_clip": 0.0115358, + "auxiliary_loss_mlp": 0.01175774, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00144243, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.4069974722256946, + "language_loss": 0.76923466, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79252815, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.5544841289520264 + }, + { + "auxiliary_loss_clip": 0.01104625, + "auxiliary_loss_mlp": 0.01175428, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00147808, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 6.626627947081242, + "language_loss": 0.79569745, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81849802, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 4.080021381378174 + }, + { + "auxiliary_loss_clip": 0.01154163, + "auxiliary_loss_mlp": 0.01175737, + "balance_loss_clip": 1.00225043, + "balance_loss_mlp": 1.00130987, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 2.427601708729306, + "language_loss": 0.87621295, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89951199, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.5598714351654053 + }, + { + "auxiliary_loss_clip": 0.01154129, + "auxiliary_loss_mlp": 0.01175574, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.0014329, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 2.5559843837862704, + "language_loss": 0.86980152, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89309859, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 2.621260643005371 + }, + { + "auxiliary_loss_clip": 0.01121831, + "auxiliary_loss_mlp": 0.01176026, + "balance_loss_clip": 1.00213325, + "balance_loss_mlp": 1.00159907, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 3.8682572557552657, + "language_loss": 0.88001072, + "learning_rate": 3.969836778645371e-06, + "loss": 0.90298927, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.631094217300415 + }, + { + "auxiliary_loss_clip": 0.01170287, + "auxiliary_loss_mlp": 0.01175702, + "balance_loss_clip": 1.00208414, + "balance_loss_mlp": 1.00165653, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.8021729144076195, + "language_loss": 0.80916172, + "learning_rate": 3.969769356810819e-06, + "loss": 0.83262157, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.5736098289489746 + }, + { + "auxiliary_loss_clip": 0.01186835, + "auxiliary_loss_mlp": 0.01175129, + "balance_loss_clip": 1.00233483, + "balance_loss_mlp": 1.00117898, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8871650905617874, + "language_loss": 0.85102427, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87464392, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.5437402725219727 + }, + { + "auxiliary_loss_clip": 0.01121111, + "auxiliary_loss_mlp": 0.01175405, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.0012641, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 3.683449259060844, + "language_loss": 0.83027756, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85324275, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.6407363414764404 + }, + { + "auxiliary_loss_clip": 0.011703, + "auxiliary_loss_mlp": 0.0074964, + "balance_loss_clip": 1.00224161, + "balance_loss_mlp": 1.00072908, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 2.5606648149955173, + "language_loss": 0.82995141, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84915078, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.5154099464416504 + }, + { + "auxiliary_loss_clip": 0.01170361, + "auxiliary_loss_mlp": 0.01175317, + "balance_loss_clip": 1.00225556, + "balance_loss_mlp": 1.00136709, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3268123519921557, + "language_loss": 0.76859462, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79205143, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.555910587310791 + }, + { + "auxiliary_loss_clip": 0.01137473, + "auxiliary_loss_mlp": 0.01175112, + "balance_loss_clip": 1.00233924, + "balance_loss_mlp": 1.00106668, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.0477684321581737, + "language_loss": 0.78018481, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80331069, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.6994009017944336 + }, + { + "auxiliary_loss_clip": 0.0118677, + "auxiliary_loss_mlp": 0.01175427, + "balance_loss_clip": 1.00222325, + "balance_loss_mlp": 1.00128579, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.7809797797426046, + "language_loss": 0.95060498, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97422695, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.479696035385132 + }, + { + "auxiliary_loss_clip": 0.01171193, + "auxiliary_loss_mlp": 0.01175272, + "balance_loss_clip": 1.00223863, + "balance_loss_mlp": 1.00113106, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.6703150384727397, + "language_loss": 0.81949544, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84296, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.5824899673461914 + }, + { + "auxiliary_loss_clip": 0.01153728, + "auxiliary_loss_mlp": 0.01175201, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.00106001, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 1.966543570540842, + "language_loss": 0.86798823, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89127755, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.6551458835601807 + }, + { + "auxiliary_loss_clip": 0.01186807, + "auxiliary_loss_mlp": 0.01175662, + "balance_loss_clip": 1.00215137, + "balance_loss_mlp": 1.00152099, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2759480477531966, + "language_loss": 0.87186641, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89549112, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.4910497665405273 + }, + { + "auxiliary_loss_clip": 0.01140185, + "auxiliary_loss_mlp": 0.00749621, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00067639, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 32.01718422577753, + "language_loss": 0.89242887, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.91132689, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.7968497276306152 + }, + { + "auxiliary_loss_clip": 0.01153544, + "auxiliary_loss_mlp": 0.0117499, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00103998, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 3.530906337029186, + "language_loss": 0.80199671, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82528198, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.598021984100342 + }, + { + "auxiliary_loss_clip": 0.01154242, + "auxiliary_loss_mlp": 0.01175646, + "balance_loss_clip": 1.00208318, + "balance_loss_mlp": 1.0016005, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1311041317058708, + "language_loss": 0.83764613, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86094499, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.548398733139038 + }, + { + "auxiliary_loss_clip": 0.0117068, + "auxiliary_loss_mlp": 0.01174937, + "balance_loss_clip": 1.00216889, + "balance_loss_mlp": 1.00108206, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.754259790801649, + "language_loss": 0.80386627, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82732242, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.5667924880981445 + }, + { + "auxiliary_loss_clip": 0.01156405, + "auxiliary_loss_mlp": 0.01175483, + "balance_loss_clip": 1.00219154, + "balance_loss_mlp": 1.00153291, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 2.673785116828843, + "language_loss": 0.79822612, + "learning_rate": 3.96881760944111e-06, + "loss": 0.821545, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.5762624740600586 + }, + { + "auxiliary_loss_clip": 0.01170602, + "auxiliary_loss_mlp": 0.01175186, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00123572, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 3.704151617309895, + "language_loss": 0.9203496, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94380742, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 2.522933006286621 + }, + { + "auxiliary_loss_clip": 0.01155178, + "auxiliary_loss_mlp": 0.01171881, + "balance_loss_clip": 1.0038023, + "balance_loss_mlp": 1.00021994, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8850514672526567, + "language_loss": 0.61764896, + "learning_rate": 3.968680450841368e-06, + "loss": 0.64091957, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.2333970069885254 + }, + { + "auxiliary_loss_clip": 0.01186694, + "auxiliary_loss_mlp": 0.0117482, + "balance_loss_clip": 1.00223255, + "balance_loss_mlp": 1.00134683, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 1.8416485968926248, + "language_loss": 0.86928499, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89290023, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.49344539642334 + }, + { + "auxiliary_loss_clip": 0.01170264, + "auxiliary_loss_mlp": 0.01175594, + "balance_loss_clip": 1.00213361, + "balance_loss_mlp": 1.00145328, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.2051694839533034, + "language_loss": 0.74528539, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76874399, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 2.5532138347625732 + }, + { + "auxiliary_loss_clip": 0.01187992, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_clip": 1.00430679, + "balance_loss_mlp": 0.99998456, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9030344389241047, + "language_loss": 0.56729758, + "learning_rate": 3.968474153054073e-06, + "loss": 0.59089398, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.0161240100860596 + }, + { + "auxiliary_loss_clip": 0.01154099, + "auxiliary_loss_mlp": 0.01175506, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00146008, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.037833038018945, + "language_loss": 0.89393783, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91723382, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.536994218826294 + }, + { + "auxiliary_loss_clip": 0.01153415, + "auxiliary_loss_mlp": 0.01175139, + "balance_loss_clip": 1.00203097, + "balance_loss_mlp": 1.00109339, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.5578590686097082, + "language_loss": 0.88270766, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90599322, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.567655324935913 + }, + { + "auxiliary_loss_clip": 0.01153497, + "auxiliary_loss_mlp": 0.01175361, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00150585, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.670011399766574, + "language_loss": 0.77270496, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79599357, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.5453906059265137 + }, + { + "auxiliary_loss_clip": 0.01170696, + "auxiliary_loss_mlp": 0.01175083, + "balance_loss_clip": 1.00211954, + "balance_loss_mlp": 1.00122797, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 3.477600857250909, + "language_loss": 0.71009159, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73354936, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 2.8825786113739014 + }, + { + "auxiliary_loss_clip": 0.01153687, + "auxiliary_loss_mlp": 0.0117499, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.00132644, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 5.298483300364746, + "language_loss": 0.74696112, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77024794, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.6239492893218994 + }, + { + "auxiliary_loss_clip": 0.01154125, + "auxiliary_loss_mlp": 0.01175005, + "balance_loss_clip": 1.00216222, + "balance_loss_mlp": 1.00095928, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.9278664075125853, + "language_loss": 0.82032883, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84362006, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.5716962814331055 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.01171669, + "balance_loss_clip": 1.0038991, + "balance_loss_mlp": 1.00000787, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8488297544085508, + "language_loss": 0.56590796, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58901411, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.164870500564575 + }, + { + "auxiliary_loss_clip": 0.01186692, + "auxiliary_loss_mlp": 0.01175188, + "balance_loss_clip": 1.00210309, + "balance_loss_mlp": 1.00133324, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.328897382507239, + "language_loss": 0.69881767, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72243649, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.554713487625122 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.0117471, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00104618, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 2.2036463347039237, + "language_loss": 0.88509679, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90822768, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.665923595428467 + }, + { + "auxiliary_loss_clip": 0.01187716, + "auxiliary_loss_mlp": 0.01171832, + "balance_loss_clip": 1.00418687, + "balance_loss_mlp": 1.00017107, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7970499024521565, + "language_loss": 0.63512266, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65871811, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 4.4780378341674805 + }, + { + "auxiliary_loss_clip": 0.01136672, + "auxiliary_loss_mlp": 0.01175046, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00138164, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.341314996447194, + "language_loss": 0.83194441, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85506159, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 4.031719446182251 + }, + { + "auxiliary_loss_clip": 0.01138445, + "auxiliary_loss_mlp": 0.01175103, + "balance_loss_clip": 1.00210905, + "balance_loss_mlp": 1.00143921, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.6986645221827985, + "language_loss": 0.75415337, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77728891, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 4.085886478424072 + }, + { + "auxiliary_loss_clip": 0.01120616, + "auxiliary_loss_mlp": 0.01175257, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.00168812, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9729692811064738, + "language_loss": 0.76417196, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78713065, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.634971857070923 + }, + { + "auxiliary_loss_clip": 0.01186723, + "auxiliary_loss_mlp": 0.01174924, + "balance_loss_clip": 1.00229883, + "balance_loss_mlp": 1.00116444, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.3315186914828336, + "language_loss": 0.93118465, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95480114, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 4.001968145370483 + }, + { + "auxiliary_loss_clip": 0.01105087, + "auxiliary_loss_mlp": 0.01175363, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00150824, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 3.6755191344565494, + "language_loss": 0.75233471, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77513927, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.6547958850860596 + }, + { + "auxiliary_loss_clip": 0.01186737, + "auxiliary_loss_mlp": 0.01175088, + "balance_loss_clip": 1.00227332, + "balance_loss_mlp": 1.00123334, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.9734551544359125, + "language_loss": 0.81703579, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84065402, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.4594504833221436 + }, + { + "auxiliary_loss_clip": 0.01170743, + "auxiliary_loss_mlp": 0.01175332, + "balance_loss_clip": 1.00225616, + "balance_loss_mlp": 1.001477, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 1.9550312124388987, + "language_loss": 0.80478972, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82825053, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.5870397090911865 + }, + { + "auxiliary_loss_clip": 0.01153494, + "auxiliary_loss_mlp": 0.01175052, + "balance_loss_clip": 1.00209033, + "balance_loss_mlp": 1.00129306, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 1.9446655499561014, + "language_loss": 0.88241953, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90570503, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.5568687915802 + }, + { + "auxiliary_loss_clip": 0.01137584, + "auxiliary_loss_mlp": 0.01175174, + "balance_loss_clip": 1.00215697, + "balance_loss_mlp": 1.00179636, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 2.1683205125550336, + "language_loss": 0.81755567, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84068328, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.6622889041900635 + }, + { + "auxiliary_loss_clip": 0.01137113, + "auxiliary_loss_mlp": 0.01175347, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00130117, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.5710899135935132, + "language_loss": 0.77950776, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80263233, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.645266532897949 + }, + { + "auxiliary_loss_clip": 0.01153735, + "auxiliary_loss_mlp": 0.01175226, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.00146651, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 1.9008348231905445, + "language_loss": 0.73189378, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75518334, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.5611658096313477 + }, + { + "auxiliary_loss_clip": 0.01137739, + "auxiliary_loss_mlp": 0.00749626, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.00073647, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 3.110133391929507, + "language_loss": 0.85296822, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87184191, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.5851705074310303 + }, + { + "auxiliary_loss_clip": 0.01153747, + "auxiliary_loss_mlp": 0.01174765, + "balance_loss_clip": 1.0020045, + "balance_loss_mlp": 1.00119591, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.371349550711528, + "language_loss": 0.79275161, + "learning_rate": 3.966870223147707e-06, + "loss": 0.8160367, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.553651809692383 + }, + { + "auxiliary_loss_clip": 0.01141299, + "auxiliary_loss_mlp": 0.01171768, + "balance_loss_clip": 1.0036962, + "balance_loss_mlp": 1.00010669, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8712558961909138, + "language_loss": 0.57905811, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60218877, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.3157565593719482 + }, + { + "auxiliary_loss_clip": 0.01170597, + "auxiliary_loss_mlp": 0.01174657, + "balance_loss_clip": 1.00225568, + "balance_loss_mlp": 1.00118387, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.0945020112752486, + "language_loss": 0.69309556, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71654809, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.618589401245117 + }, + { + "auxiliary_loss_clip": 0.01105154, + "auxiliary_loss_mlp": 0.01174756, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00118732, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.483898425728696, + "language_loss": 0.72737718, + "learning_rate": 3.966658105434627e-06, + "loss": 0.75017625, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.6636037826538086 + }, + { + "auxiliary_loss_clip": 0.01169874, + "auxiliary_loss_mlp": 0.01174939, + "balance_loss_clip": 1.00218153, + "balance_loss_mlp": 1.0011797, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.606076851122317, + "language_loss": 0.64273262, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66618073, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.6625654697418213 + }, + { + "auxiliary_loss_clip": 0.01137051, + "auxiliary_loss_mlp": 0.0117439, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00101161, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 1.9242036660995534, + "language_loss": 0.8752948, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89840919, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.655996561050415 + }, + { + "auxiliary_loss_clip": 0.0113686, + "auxiliary_loss_mlp": 0.00749637, + "balance_loss_clip": 1.00201762, + "balance_loss_mlp": 1.00075698, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.1551342198151646, + "language_loss": 0.83469212, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85355711, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.633918523788452 + }, + { + "auxiliary_loss_clip": 0.0118702, + "auxiliary_loss_mlp": 0.01171597, + "balance_loss_clip": 1.00367451, + "balance_loss_mlp": 0.99993593, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8476912321185394, + "language_loss": 0.60461032, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62819648, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.2277417182922363 + }, + { + "auxiliary_loss_clip": 0.01153351, + "auxiliary_loss_mlp": 0.0117496, + "balance_loss_clip": 1.00208831, + "balance_loss_mlp": 1.00110519, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.8849779112615574, + "language_loss": 0.79997241, + "learning_rate": 3.96630308443127e-06, + "loss": 0.82325554, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.6057486534118652 + }, + { + "auxiliary_loss_clip": 0.01169951, + "auxiliary_loss_mlp": 0.01174777, + "balance_loss_clip": 1.00205088, + "balance_loss_mlp": 1.00101745, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 6.101209386031659, + "language_loss": 0.82237816, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84582543, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.594060182571411 + }, + { + "auxiliary_loss_clip": 0.01186726, + "auxiliary_loss_mlp": 0.01174739, + "balance_loss_clip": 1.00228357, + "balance_loss_mlp": 1.00097907, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.962269006998703, + "language_loss": 0.86605978, + "learning_rate": 3.966160554074189e-06, + "loss": 0.88967443, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.484741687774658 + }, + { + "auxiliary_loss_clip": 0.01170076, + "auxiliary_loss_mlp": 0.01174771, + "balance_loss_clip": 1.00233126, + "balance_loss_mlp": 1.00129795, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.8558187938825528, + "language_loss": 0.82250023, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8459487, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.5325801372528076 + }, + { + "auxiliary_loss_clip": 0.01170267, + "auxiliary_loss_mlp": 0.01170871, + "balance_loss_clip": 1.00360441, + "balance_loss_mlp": 0.99997294, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.7257233398561168, + "language_loss": 0.54763067, + "learning_rate": 3.966017725489091e-06, + "loss": 0.57104206, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.14949369430542 + }, + { + "auxiliary_loss_clip": 0.01137273, + "auxiliary_loss_mlp": 0.01174527, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00114894, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 3.0541908146963914, + "language_loss": 0.84868652, + "learning_rate": 3.965946199367804e-06, + "loss": 0.87180454, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.576914072036743 + }, + { + "auxiliary_loss_clip": 0.01186787, + "auxiliary_loss_mlp": 0.0117485, + "balance_loss_clip": 1.0023607, + "balance_loss_mlp": 1.00118577, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.0294908634212496, + "language_loss": 0.80185074, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82546711, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 2.4589946269989014 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01174791, + "balance_loss_clip": 1.00235164, + "balance_loss_mlp": 1.001127, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.797934983151659, + "language_loss": 0.7067976, + "learning_rate": 3.965802923481313e-06, + "loss": 0.72992206, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.7662155628204346 + }, + { + "auxiliary_loss_clip": 0.01121072, + "auxiliary_loss_mlp": 0.01174697, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00103319, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 2.6363674556841343, + "language_loss": 0.83629459, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85925233, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.6087653636932373 + }, + { + "auxiliary_loss_clip": 0.01120037, + "auxiliary_loss_mlp": 0.00749553, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00062084, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.3140751342481183, + "language_loss": 0.74518776, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76388371, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.6768527030944824 + }, + { + "auxiliary_loss_clip": 0.01154301, + "auxiliary_loss_mlp": 0.01174922, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00135362, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 9.04069015955427, + "language_loss": 0.80013764, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82342982, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.552079200744629 + }, + { + "auxiliary_loss_clip": 0.01153634, + "auxiliary_loss_mlp": 0.01174749, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.00146604, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 2.454555226840735, + "language_loss": 0.71499336, + "learning_rate": 3.96551547720879e-06, + "loss": 0.7382772, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.579530954360962 + }, + { + "auxiliary_loss_clip": 0.01171115, + "auxiliary_loss_mlp": 0.01171459, + "balance_loss_clip": 1.00358558, + "balance_loss_mlp": 0.999798, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7857841803643418, + "language_loss": 0.58557796, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60900372, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.154271125793457 + }, + { + "auxiliary_loss_clip": 0.01186492, + "auxiliary_loss_mlp": 0.01174442, + "balance_loss_clip": 1.00226343, + "balance_loss_mlp": 1.00115919, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.6806574830915002, + "language_loss": 0.7757172, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79932654, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.6451849937438965 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_clip": 1.0020473, + "balance_loss_mlp": 1.0011344, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.1164771139006024, + "language_loss": 0.72199047, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74495214, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 2.7779409885406494 + }, + { + "auxiliary_loss_clip": 0.01170056, + "auxiliary_loss_mlp": 0.01174491, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00120878, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.8801977984988456, + "language_loss": 0.86313462, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88658011, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.61185884475708 + }, + { + "auxiliary_loss_clip": 0.01154107, + "auxiliary_loss_mlp": 0.0117444, + "balance_loss_clip": 1.00238109, + "balance_loss_mlp": 1.0013479, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 3.0274315798444316, + "language_loss": 0.80868274, + "learning_rate": 3.965154492406486e-06, + "loss": 0.83196819, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 3.9841911792755127 + }, + { + "auxiliary_loss_clip": 0.01103167, + "auxiliary_loss_mlp": 0.01174556, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00117826, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.303966410339029, + "language_loss": 0.8458547, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.8686319, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 4.072586536407471 + }, + { + "auxiliary_loss_clip": 0.01169963, + "auxiliary_loss_mlp": 0.01174581, + "balance_loss_clip": 1.00208867, + "balance_loss_mlp": 1.00139427, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 4.341249501436945, + "language_loss": 0.80986822, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83331364, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 2.502228021621704 + }, + { + "auxiliary_loss_clip": 0.01156447, + "auxiliary_loss_mlp": 0.01174679, + "balance_loss_clip": 1.00242448, + "balance_loss_mlp": 1.00139618, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.8078749562102032, + "language_loss": 0.76181793, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78512913, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 3.997926712036133 + }, + { + "auxiliary_loss_clip": 0.01153917, + "auxiliary_loss_mlp": 0.01174681, + "balance_loss_clip": 1.00210178, + "balance_loss_mlp": 1.0011121, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 1.9335592845513043, + "language_loss": 0.75092614, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.77421206, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 3.9876010417938232 + }, + { + "auxiliary_loss_clip": 0.01170317, + "auxiliary_loss_mlp": 0.01174643, + "balance_loss_clip": 1.00214291, + "balance_loss_mlp": 1.00107455, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.076945609116266, + "language_loss": 0.83374238, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85719198, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.565000295639038 + }, + { + "auxiliary_loss_clip": 0.01154074, + "auxiliary_loss_mlp": 0.0117461, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00142229, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 2.3747902564264405, + "language_loss": 0.78587568, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80916256, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.57747483253479 + }, + { + "auxiliary_loss_clip": 0.01186481, + "auxiliary_loss_mlp": 0.01174955, + "balance_loss_clip": 1.00232315, + "balance_loss_mlp": 1.00167239, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.9949116505219442, + "language_loss": 0.84856004, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87217444, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 2.4954373836517334 + }, + { + "auxiliary_loss_clip": 0.01121554, + "auxiliary_loss_mlp": 0.00749572, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00069904, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 2.2469047754872853, + "language_loss": 0.83565187, + "learning_rate": 3.964573041885641e-06, + "loss": 0.85436308, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.6975784301757812 + }, + { + "auxiliary_loss_clip": 0.0116978, + "auxiliary_loss_mlp": 0.0117464, + "balance_loss_clip": 1.00213504, + "balance_loss_mlp": 1.00126171, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 2.1199423240923427, + "language_loss": 0.75518107, + "learning_rate": 3.964500025305907e-06, + "loss": 0.77862525, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.556975841522217 + }, + { + "auxiliary_loss_clip": 0.01169909, + "auxiliary_loss_mlp": 0.01174644, + "balance_loss_clip": 1.00216329, + "balance_loss_mlp": 1.00145686, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.9653550339705546, + "language_loss": 0.80684412, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.8302896, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.539804220199585 + }, + { + "auxiliary_loss_clip": 0.01186491, + "auxiliary_loss_mlp": 0.01174669, + "balance_loss_clip": 1.0023365, + "balance_loss_mlp": 1.00148118, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 1.9271745398340374, + "language_loss": 0.77797604, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.80158758, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.474529981613159 + }, + { + "auxiliary_loss_clip": 0.01186528, + "auxiliary_loss_mlp": 0.01174911, + "balance_loss_clip": 1.0022831, + "balance_loss_mlp": 1.00162816, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.9563784456944957, + "language_loss": 0.84422934, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86784375, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.490675449371338 + }, + { + "auxiliary_loss_clip": 0.0114071, + "auxiliary_loss_mlp": 0.01174301, + "balance_loss_clip": 1.00253916, + "balance_loss_mlp": 1.00139976, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.70217869960649, + "language_loss": 0.83612835, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85927844, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.620654821395874 + }, + { + "auxiliary_loss_clip": 0.01153958, + "auxiliary_loss_mlp": 0.01174298, + "balance_loss_clip": 1.00220954, + "balance_loss_mlp": 1.00111103, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 3.159617252070954, + "language_loss": 0.83050841, + "learning_rate": 3.964133825052146e-06, + "loss": 0.853791, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.58266282081604 + }, + { + "auxiliary_loss_clip": 0.01104617, + "auxiliary_loss_mlp": 0.01174625, + "balance_loss_clip": 1.00160003, + "balance_loss_mlp": 1.00153327, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 2.1798429053848873, + "language_loss": 0.78924274, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81203514, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.7606089115142822 + }, + { + "auxiliary_loss_clip": 0.01140257, + "auxiliary_loss_mlp": 0.01174407, + "balance_loss_clip": 1.00235105, + "balance_loss_mlp": 1.00122011, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.68852106613756, + "language_loss": 0.79226434, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81541097, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.6251769065856934 + }, + { + "auxiliary_loss_clip": 0.01186404, + "auxiliary_loss_mlp": 0.01174442, + "balance_loss_clip": 1.0021801, + "balance_loss_mlp": 1.00115943, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 4.812290588533077, + "language_loss": 0.74440205, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76801056, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.6904308795928955 + }, + { + "auxiliary_loss_clip": 0.01153257, + "auxiliary_loss_mlp": 0.01174724, + "balance_loss_clip": 1.00207806, + "balance_loss_mlp": 1.00153708, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.5467194374720212, + "language_loss": 0.74632001, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76959985, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.677992105484009 + }, + { + "auxiliary_loss_clip": 0.01186412, + "auxiliary_loss_mlp": 0.01174193, + "balance_loss_clip": 1.00216579, + "balance_loss_mlp": 1.00090981, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 4.560248048569395, + "language_loss": 0.87015986, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89376593, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.506199359893799 + }, + { + "auxiliary_loss_clip": 0.01170482, + "auxiliary_loss_mlp": 0.01174481, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.0012939, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 2.0095686960739005, + "language_loss": 0.77702516, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80047482, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.5905187129974365 + }, + { + "auxiliary_loss_clip": 0.01156328, + "auxiliary_loss_mlp": 0.01174204, + "balance_loss_clip": 1.00220323, + "balance_loss_mlp": 1.00111187, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 4.448490271110257, + "language_loss": 0.78013086, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80343622, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.616079092025757 + }, + { + "auxiliary_loss_clip": 0.01169933, + "auxiliary_loss_mlp": 0.01174959, + "balance_loss_clip": 1.00214696, + "balance_loss_mlp": 1.00148606, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 1.7788617285573383, + "language_loss": 0.67089373, + "learning_rate": 3.963544031823624e-06, + "loss": 0.69434261, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.5942418575286865 + }, + { + "auxiliary_loss_clip": 0.01121279, + "auxiliary_loss_mlp": 0.01174189, + "balance_loss_clip": 1.00199902, + "balance_loss_mlp": 1.00109708, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 3.7608431406240608, + "language_loss": 0.96312582, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98608053, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.6780457496643066 + }, + { + "auxiliary_loss_clip": 0.01154424, + "auxiliary_loss_mlp": 0.01174562, + "balance_loss_clip": 1.00236261, + "balance_loss_mlp": 1.00127923, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.1338357817908005, + "language_loss": 0.79180586, + "learning_rate": 3.96339583888261e-06, + "loss": 0.81509578, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.6603429317474365 + }, + { + "auxiliary_loss_clip": 0.01170515, + "auxiliary_loss_mlp": 0.01175298, + "balance_loss_clip": 1.00229156, + "balance_loss_mlp": 1.00163352, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.876238910708605, + "language_loss": 0.85200417, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87546229, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.5068442821502686 + }, + { + "auxiliary_loss_clip": 0.01186552, + "auxiliary_loss_mlp": 0.01174635, + "balance_loss_clip": 1.00231075, + "balance_loss_mlp": 1.00135231, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.6964069504131782, + "language_loss": 0.80134791, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82495975, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01170083, + "auxiliary_loss_mlp": 0.01174538, + "balance_loss_clip": 1.00228238, + "balance_loss_mlp": 1.00144649, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 1.9417460747974211, + "language_loss": 0.83185995, + "learning_rate": 3.96317299108688e-06, + "loss": 0.85530615, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.5418787002563477 + }, + { + "auxiliary_loss_clip": 0.01136739, + "auxiliary_loss_mlp": 0.01174684, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.00149655, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.9911799613671928, + "language_loss": 0.76669383, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78980803, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.6256942749023438 + }, + { + "auxiliary_loss_clip": 0.01154299, + "auxiliary_loss_mlp": 0.01174371, + "balance_loss_clip": 1.00215685, + "balance_loss_mlp": 1.00108838, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.377816028099712, + "language_loss": 0.82954335, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85283011, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.5839335918426514 + }, + { + "auxiliary_loss_clip": 0.01170458, + "auxiliary_loss_mlp": 0.01174078, + "balance_loss_clip": 1.00226271, + "balance_loss_mlp": 1.00108147, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 2.5001936244385887, + "language_loss": 0.72053361, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74397898, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 2.75813627243042 + }, + { + "auxiliary_loss_clip": 0.01136969, + "auxiliary_loss_mlp": 0.01174118, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00112176, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 2.209753746194923, + "language_loss": 0.89449251, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91760343, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.782147169113159 + }, + { + "auxiliary_loss_clip": 0.01169795, + "auxiliary_loss_mlp": 0.01174438, + "balance_loss_clip": 1.00214195, + "balance_loss_mlp": 1.0011549, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 4.410984219436186, + "language_loss": 0.73467815, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75812042, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.5704920291900635 + }, + { + "auxiliary_loss_clip": 0.01186173, + "auxiliary_loss_mlp": 0.00749474, + "balance_loss_clip": 1.00220585, + "balance_loss_mlp": 1.00057054, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 2.3109128953738955, + "language_loss": 0.77033091, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.7896874, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.5211102962493896 + }, + { + "auxiliary_loss_clip": 0.01186347, + "auxiliary_loss_mlp": 0.01174517, + "balance_loss_clip": 1.0022881, + "balance_loss_mlp": 1.00123405, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.7523937369333784, + "language_loss": 0.71116334, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73477197, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.591905355453491 + }, + { + "auxiliary_loss_clip": 0.01186284, + "auxiliary_loss_mlp": 0.01174338, + "balance_loss_clip": 1.00226569, + "balance_loss_mlp": 1.00124645, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.081700097074388, + "language_loss": 0.87215388, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89576006, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.522371768951416 + }, + { + "auxiliary_loss_clip": 0.01088504, + "auxiliary_loss_mlp": 0.01174702, + "balance_loss_clip": 1.00210106, + "balance_loss_mlp": 1.00141954, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 2.1207254228119443, + "language_loss": 0.82880247, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85143453, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.6984570026397705 + }, + { + "auxiliary_loss_clip": 0.01170473, + "auxiliary_loss_mlp": 0.01174584, + "balance_loss_clip": 1.00238228, + "balance_loss_mlp": 1.0012058, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 2.4018749360785483, + "language_loss": 0.69847846, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72192907, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.5136234760284424 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01174275, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00118363, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 2.1343148539047734, + "language_loss": 0.80061257, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82388747, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 5.470320224761963 + }, + { + "auxiliary_loss_clip": 0.01104077, + "auxiliary_loss_mlp": 0.01174659, + "balance_loss_clip": 1.00189626, + "balance_loss_mlp": 1.00128078, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 3.9232790430456883, + "language_loss": 0.82843757, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.8512249, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 2.72245192527771 + }, + { + "auxiliary_loss_clip": 0.01153891, + "auxiliary_loss_mlp": 0.01174836, + "balance_loss_clip": 1.00221109, + "balance_loss_mlp": 1.00164843, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.942200462674295, + "language_loss": 0.78727597, + "learning_rate": 3.962199576140195e-06, + "loss": 0.81056333, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 2.556678056716919 + }, + { + "auxiliary_loss_clip": 0.01170465, + "auxiliary_loss_mlp": 0.00749509, + "balance_loss_clip": 1.00232565, + "balance_loss_mlp": 1.00061584, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.653030527202828, + "language_loss": 0.93469965, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95389938, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.559673309326172 + }, + { + "auxiliary_loss_clip": 0.01136519, + "auxiliary_loss_mlp": 0.01174654, + "balance_loss_clip": 1.00204074, + "balance_loss_mlp": 1.00127578, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.2017682539759704, + "language_loss": 0.74365604, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76676774, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 5.35478138923645 + }, + { + "auxiliary_loss_clip": 0.01137388, + "auxiliary_loss_mlp": 0.01170756, + "balance_loss_clip": 1.00298715, + "balance_loss_mlp": 0.99985784, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7274363152960078, + "language_loss": 0.58263028, + "learning_rate": 3.96197315593058e-06, + "loss": 0.6057117, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.1831605434417725 + }, + { + "auxiliary_loss_clip": 0.01154223, + "auxiliary_loss_mlp": 0.01174056, + "balance_loss_clip": 1.0020777, + "balance_loss_mlp": 1.00105906, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.418235688028135, + "language_loss": 0.69754845, + "learning_rate": 3.961897533727119e-06, + "loss": 0.72083127, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.7557177543640137 + }, + { + "auxiliary_loss_clip": 0.01120959, + "auxiliary_loss_mlp": 0.01174592, + "balance_loss_clip": 1.00214803, + "balance_loss_mlp": 1.00140476, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.171765617165877, + "language_loss": 0.86317337, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88612884, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.6919355392456055 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01175011, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.00144231, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.842321924261245, + "language_loss": 0.72448546, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74761027, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 2.6174676418304443 + }, + { + "auxiliary_loss_clip": 0.01137012, + "auxiliary_loss_mlp": 0.01174257, + "balance_loss_clip": 1.00211334, + "balance_loss_mlp": 1.00106943, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.5486792248738523, + "language_loss": 0.809039, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83215165, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 2.590012311935425 + }, + { + "auxiliary_loss_clip": 0.01120987, + "auxiliary_loss_mlp": 0.01174213, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00131202, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 6.3317613076835295, + "language_loss": 0.76172352, + "learning_rate": 3.961594300988482e-06, + "loss": 0.7846756, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.7173538208007812 + }, + { + "auxiliary_loss_clip": 0.01153219, + "auxiliary_loss_mlp": 0.01170802, + "balance_loss_clip": 1.00294924, + "balance_loss_mlp": 0.99990314, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7412354815354201, + "language_loss": 0.57713294, + "learning_rate": 3.961518306836998e-06, + "loss": 0.60037315, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 3.0250489711761475 + }, + { + "auxiliary_loss_clip": 0.01153709, + "auxiliary_loss_mlp": 0.01174, + "balance_loss_clip": 1.00211215, + "balance_loss_mlp": 1.00119472, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 3.321858007222235, + "language_loss": 0.85028267, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87355971, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.56691575050354 + }, + { + "auxiliary_loss_clip": 0.01152866, + "auxiliary_loss_mlp": 0.01174463, + "balance_loss_clip": 1.00207269, + "balance_loss_mlp": 1.00127578, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.515505655366963, + "language_loss": 0.83994353, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86321682, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.6042182445526123 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01174201, + "balance_loss_clip": 1.00218713, + "balance_loss_mlp": 1.00101423, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 2.4593291817833673, + "language_loss": 0.85490566, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87818611, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.6066102981567383 + }, + { + "auxiliary_loss_clip": 0.01136864, + "auxiliary_loss_mlp": 0.01174353, + "balance_loss_clip": 1.00203621, + "balance_loss_mlp": 1.00126147, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 2.4124625348200537, + "language_loss": 0.85240769, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87551987, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.6710197925567627 + }, + { + "auxiliary_loss_clip": 0.01136716, + "auxiliary_loss_mlp": 0.01174024, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00112343, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.6502344735338803, + "language_loss": 0.86611664, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88922405, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.596360683441162 + }, + { + "auxiliary_loss_clip": 0.01169859, + "auxiliary_loss_mlp": 0.01174563, + "balance_loss_clip": 1.00232816, + "balance_loss_mlp": 1.00128078, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 2.119210913549601, + "language_loss": 0.86311436, + "learning_rate": 3.961060780028764e-06, + "loss": 0.88655853, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.5583086013793945 + }, + { + "auxiliary_loss_clip": 0.01119496, + "auxiliary_loss_mlp": 0.01174298, + "balance_loss_clip": 1.00198901, + "balance_loss_mlp": 1.00158811, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 4.049394292740019, + "language_loss": 0.90104067, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92397863, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.6887223720550537 + }, + { + "auxiliary_loss_clip": 0.01152762, + "auxiliary_loss_mlp": 0.01174254, + "balance_loss_clip": 1.00195563, + "balance_loss_mlp": 1.00144792, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 3.499042268202861, + "language_loss": 0.85222191, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87549204, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.622626781463623 + }, + { + "auxiliary_loss_clip": 0.0115328, + "auxiliary_loss_mlp": 0.01174071, + "balance_loss_clip": 1.00209343, + "balance_loss_mlp": 1.00136042, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5300547273501368, + "language_loss": 0.81242812, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83570164, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.6955227851867676 + }, + { + "auxiliary_loss_clip": 0.01169529, + "auxiliary_loss_mlp": 0.01174573, + "balance_loss_clip": 1.00220442, + "balance_loss_mlp": 1.00176692, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.7292309704594984, + "language_loss": 0.7773326, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80077356, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.524751901626587 + }, + { + "auxiliary_loss_clip": 0.01170154, + "auxiliary_loss_mlp": 0.01173922, + "balance_loss_clip": 1.00223505, + "balance_loss_mlp": 1.00111628, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 6.654263386948106, + "language_loss": 0.86666822, + "learning_rate": 3.960677462662594e-06, + "loss": 0.89010906, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.5462708473205566 + }, + { + "auxiliary_loss_clip": 0.01152761, + "auxiliary_loss_mlp": 0.01173961, + "balance_loss_clip": 1.00208151, + "balance_loss_mlp": 1.00106013, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 5.106814425828106, + "language_loss": 0.72954929, + "learning_rate": 3.96060057613046e-06, + "loss": 0.7528165, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.5876638889312744 + }, + { + "auxiliary_loss_clip": 0.01153246, + "auxiliary_loss_mlp": 0.01174257, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.0012604, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 6.310465847974083, + "language_loss": 0.86252952, + "learning_rate": 3.960523615252156e-06, + "loss": 0.88580453, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.564633369445801 + }, + { + "auxiliary_loss_clip": 0.01104775, + "auxiliary_loss_mlp": 0.01174476, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00157523, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 1.8698658781425241, + "language_loss": 0.84101391, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86380649, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.703934907913208 + }, + { + "auxiliary_loss_clip": 0.01186019, + "auxiliary_loss_mlp": 0.0117411, + "balance_loss_clip": 1.00222993, + "balance_loss_mlp": 1.00168538, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.8934085496709645, + "language_loss": 0.81023759, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83383888, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.549187421798706 + }, + { + "auxiliary_loss_clip": 0.01153544, + "auxiliary_loss_mlp": 0.00749478, + "balance_loss_clip": 1.00213122, + "balance_loss_mlp": 1.00054049, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.3803241351197397, + "language_loss": 0.74526203, + "learning_rate": 3.960292286569418e-06, + "loss": 0.7642923, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.5537004470825195 + }, + { + "auxiliary_loss_clip": 0.01137288, + "auxiliary_loss_mlp": 0.01174051, + "balance_loss_clip": 1.00226772, + "balance_loss_mlp": 1.00124502, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.1111179738842174, + "language_loss": 0.8597182, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88283157, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 2.5751264095306396 + }, + { + "auxiliary_loss_clip": 0.01152772, + "auxiliary_loss_mlp": 0.01173833, + "balance_loss_clip": 1.00204539, + "balance_loss_mlp": 1.00102723, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.606183699531973, + "language_loss": 0.74859953, + "learning_rate": 3.96013769577032e-06, + "loss": 0.77186561, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.6447486877441406 + }, + { + "auxiliary_loss_clip": 0.01185935, + "auxiliary_loss_mlp": 0.01173804, + "balance_loss_clip": 1.00219691, + "balance_loss_mlp": 1.00118935, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 2.2913310655227357, + "language_loss": 0.7672317, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79082906, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.4752440452575684 + }, + { + "auxiliary_loss_clip": 0.01169289, + "auxiliary_loss_mlp": 0.01173714, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00119424, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 4.1453444694748525, + "language_loss": 0.78436673, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80779678, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.550900936126709 + }, + { + "auxiliary_loss_clip": 0.01137607, + "auxiliary_loss_mlp": 0.01173971, + "balance_loss_clip": 1.00212991, + "balance_loss_mlp": 1.00106955, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 4.015751387681989, + "language_loss": 0.7676127, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79072857, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.5878918170928955 + }, + { + "auxiliary_loss_clip": 0.0118607, + "auxiliary_loss_mlp": 0.00749436, + "balance_loss_clip": 1.00219989, + "balance_loss_mlp": 1.00049531, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 2.3652800361372184, + "language_loss": 0.83436739, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85372251, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.5266785621643066 + }, + { + "auxiliary_loss_clip": 0.01123324, + "auxiliary_loss_mlp": 0.01173735, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00121498, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 5.4883499835664615, + "language_loss": 0.84049809, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86346871, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.6208555698394775 + }, + { + "auxiliary_loss_clip": 0.01137342, + "auxiliary_loss_mlp": 0.01173572, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00095689, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 2.03443476405724, + "language_loss": 0.81335282, + "learning_rate": 3.959672139580233e-06, + "loss": 0.8364619, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.650569438934326 + }, + { + "auxiliary_loss_clip": 0.01153457, + "auxiliary_loss_mlp": 0.01173656, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.00104117, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.6347338900221735, + "language_loss": 0.83995241, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.86322355, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.633270502090454 + }, + { + "auxiliary_loss_clip": 0.01136207, + "auxiliary_loss_mlp": 0.01173922, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00102079, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 2.5512219563589116, + "language_loss": 0.90045851, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92355984, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 4.006886959075928 + }, + { + "auxiliary_loss_clip": 0.01137465, + "auxiliary_loss_mlp": 0.01174001, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.00129044, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.237453042556333, + "language_loss": 0.75842834, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78154296, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 4.104366779327393 + }, + { + "auxiliary_loss_clip": 0.01169973, + "auxiliary_loss_mlp": 0.01173527, + "balance_loss_clip": 1.00216806, + "balance_loss_mlp": 1.00119817, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.748660776759213, + "language_loss": 0.8170954, + "learning_rate": 3.959360282528346e-06, + "loss": 0.8405304, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.530818223953247 + }, + { + "auxiliary_loss_clip": 0.01185917, + "auxiliary_loss_mlp": 0.01173674, + "balance_loss_clip": 1.00217938, + "balance_loss_mlp": 1.00115418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 3.943270155589145, + "language_loss": 0.89237207, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91596794, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 2.4962339401245117 + }, + { + "auxiliary_loss_clip": 0.01153232, + "auxiliary_loss_mlp": 0.01173844, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00113368, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.728379723268502, + "language_loss": 0.80857432, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83184505, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 5.442292213439941 + }, + { + "auxiliary_loss_clip": 0.01152601, + "auxiliary_loss_mlp": 0.01170622, + "balance_loss_clip": 1.00327897, + "balance_loss_mlp": 1.00048661, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.777611813101709, + "language_loss": 0.57307386, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59630609, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.262042760848999 + }, + { + "auxiliary_loss_clip": 0.01152511, + "auxiliary_loss_mlp": 0.01173712, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00128746, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 3.0707983090773636, + "language_loss": 0.67461753, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69787973, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.5415799617767334 + }, + { + "auxiliary_loss_clip": 0.01137496, + "auxiliary_loss_mlp": 0.01173388, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00096345, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.7427928885803115, + "language_loss": 0.8394748, + "learning_rate": 3.958968789505198e-06, + "loss": 0.8625837, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.6120035648345947 + }, + { + "auxiliary_loss_clip": 0.01186596, + "auxiliary_loss_mlp": 0.01170336, + "balance_loss_clip": 1.00365973, + "balance_loss_mlp": 1.00020087, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8797018449828712, + "language_loss": 0.61899269, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64256203, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.0804662704467773 + }, + { + "auxiliary_loss_clip": 0.01152975, + "auxiliary_loss_mlp": 0.01173692, + "balance_loss_clip": 1.00215054, + "balance_loss_mlp": 1.00126779, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.9024377550228644, + "language_loss": 0.82753086, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85079753, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 2.611980676651001 + }, + { + "auxiliary_loss_clip": 0.01137109, + "auxiliary_loss_mlp": 0.01173711, + "balance_loss_clip": 1.00201643, + "balance_loss_mlp": 1.00138199, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.982667945726004, + "language_loss": 0.72272491, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74583304, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 2.9216179847717285 + }, + { + "auxiliary_loss_clip": 0.01169979, + "auxiliary_loss_mlp": 0.0117375, + "balance_loss_clip": 1.0021832, + "balance_loss_mlp": 1.00094485, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.6481844278852424, + "language_loss": 0.77502632, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79846358, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.6274526119232178 + }, + { + "auxiliary_loss_clip": 0.01135941, + "auxiliary_loss_mlp": 0.01173782, + "balance_loss_clip": 1.00199509, + "balance_loss_mlp": 1.00107169, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 4.036720537503931, + "language_loss": 0.74290264, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76599991, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.584510326385498 + }, + { + "auxiliary_loss_clip": 0.01170203, + "auxiliary_loss_mlp": 0.01173587, + "balance_loss_clip": 1.00217235, + "balance_loss_mlp": 1.00097227, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 1.8264574301609735, + "language_loss": 0.84729302, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.870731, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.5582034587860107 + }, + { + "auxiliary_loss_clip": 0.01137479, + "auxiliary_loss_mlp": 0.01173618, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00119376, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.4432764562273404, + "language_loss": 0.6740796, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69719052, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.6662161350250244 + }, + { + "auxiliary_loss_clip": 0.01104558, + "auxiliary_loss_mlp": 0.01173818, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00110769, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 1.8017835866982193, + "language_loss": 0.8357712, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85855496, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.683039665222168 + }, + { + "auxiliary_loss_clip": 0.01169477, + "auxiliary_loss_mlp": 0.01173572, + "balance_loss_clip": 1.00238037, + "balance_loss_mlp": 1.0010525, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.76185136520961, + "language_loss": 0.75892884, + "learning_rate": 3.958259422403966e-06, + "loss": 0.7823593, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.5918843746185303 + }, + { + "auxiliary_loss_clip": 0.01137203, + "auxiliary_loss_mlp": 0.01173797, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00118232, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.2532802226937227, + "language_loss": 0.83046114, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85357118, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.6855242252349854 + }, + { + "auxiliary_loss_clip": 0.0117039, + "auxiliary_loss_mlp": 0.00749835, + "balance_loss_clip": 1.00346279, + "balance_loss_mlp": 1.00127161, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7432331608874351, + "language_loss": 0.61790109, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63710332, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.279387950897217 + }, + { + "auxiliary_loss_clip": 0.01170182, + "auxiliary_loss_mlp": 0.01170571, + "balance_loss_clip": 1.00411129, + "balance_loss_mlp": 1.00043571, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8237667612454638, + "language_loss": 0.58922887, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61263645, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.2627551555633545 + }, + { + "auxiliary_loss_clip": 0.01139848, + "auxiliary_loss_mlp": 0.01173865, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00115418, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 1.7329825089134672, + "language_loss": 0.87904382, + "learning_rate": 3.957942217314823e-06, + "loss": 0.90218097, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.6462671756744385 + }, + { + "auxiliary_loss_clip": 0.011534, + "auxiliary_loss_mlp": 0.0117382, + "balance_loss_clip": 1.00215161, + "balance_loss_mlp": 1.00149095, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 3.502148720406618, + "language_loss": 0.81426787, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83754003, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.612083673477173 + }, + { + "auxiliary_loss_clip": 0.01170036, + "auxiliary_loss_mlp": 0.01170484, + "balance_loss_clip": 1.00369453, + "balance_loss_mlp": 1.00034893, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8748917466149055, + "language_loss": 0.59693384, + "learning_rate": 3.957783169286024e-06, + "loss": 0.6203391, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.1395037174224854 + }, + { + "auxiliary_loss_clip": 0.01170011, + "auxiliary_loss_mlp": 0.01173675, + "balance_loss_clip": 1.00237584, + "balance_loss_mlp": 1.00134635, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5940350530738148, + "language_loss": 0.84343392, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86687076, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.6703083515167236 + }, + { + "auxiliary_loss_clip": 0.01087705, + "auxiliary_loss_mlp": 0.01173701, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00137234, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 3.5064520994713577, + "language_loss": 0.78071868, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80333275, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.793189287185669 + }, + { + "auxiliary_loss_clip": 0.01152685, + "auxiliary_loss_mlp": 0.01173707, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.0011878, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.374464776173747, + "language_loss": 0.80411947, + "learning_rate": 3.957544040455379e-06, + "loss": 0.8273834, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 2.5724992752075195 + }, + { + "auxiliary_loss_clip": 0.01136605, + "auxiliary_loss_mlp": 0.01173918, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.0013032, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 3.391779137494673, + "language_loss": 0.76432377, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78742903, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.6278345584869385 + }, + { + "auxiliary_loss_clip": 0.01136906, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00119138, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.805038490393852, + "language_loss": 0.80996269, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83306694, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.643301248550415 + }, + { + "auxiliary_loss_clip": 0.01169159, + "auxiliary_loss_mlp": 0.01173144, + "balance_loss_clip": 1.0020771, + "balance_loss_mlp": 1.00081491, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.6906623908545866, + "language_loss": 0.61336684, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63678992, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.6331305503845215 + }, + { + "auxiliary_loss_clip": 0.01153157, + "auxiliary_loss_mlp": 0.01173644, + "balance_loss_clip": 1.00219274, + "balance_loss_mlp": 1.00141072, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.2916508633466903, + "language_loss": 0.8514806, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87474859, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.5663564205169678 + }, + { + "auxiliary_loss_clip": 0.01152855, + "auxiliary_loss_mlp": 0.01173429, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00109994, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 2.159285795710488, + "language_loss": 0.76159954, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78486246, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.5864508152008057 + }, + { + "auxiliary_loss_clip": 0.01156523, + "auxiliary_loss_mlp": 0.0117365, + "balance_loss_clip": 1.00239098, + "balance_loss_mlp": 1.00141633, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0903137851645117, + "language_loss": 0.79961562, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82291734, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.598973274230957 + }, + { + "auxiliary_loss_clip": 0.0115248, + "auxiliary_loss_mlp": 0.01173807, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00147867, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8588827610480032, + "language_loss": 0.75238502, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77564788, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.554203510284424 + }, + { + "auxiliary_loss_clip": 0.01135978, + "auxiliary_loss_mlp": 0.00749445, + "balance_loss_clip": 1.00193489, + "balance_loss_mlp": 1.00049663, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.2705274031864198, + "language_loss": 0.78161776, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80047196, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 2.5999536514282227 + }, + { + "auxiliary_loss_clip": 0.0115346, + "auxiliary_loss_mlp": 0.01173584, + "balance_loss_clip": 1.00226331, + "balance_loss_mlp": 1.00154138, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 1.7635709253742922, + "language_loss": 0.82263422, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84590465, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.6122865676879883 + }, + { + "auxiliary_loss_clip": 0.01185911, + "auxiliary_loss_mlp": 0.01173264, + "balance_loss_clip": 1.0023073, + "balance_loss_mlp": 1.00093472, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 2.0686149089295918, + "language_loss": 0.76447582, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78806752, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.497372627258301 + }, + { + "auxiliary_loss_clip": 0.01120196, + "auxiliary_loss_mlp": 0.01173675, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00134563, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.409020316047251, + "language_loss": 0.85472536, + "learning_rate": 3.956661519635756e-06, + "loss": 0.87766409, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 2.6280200481414795 + }, + { + "auxiliary_loss_clip": 0.01120854, + "auxiliary_loss_mlp": 0.01173422, + "balance_loss_clip": 1.00221789, + "balance_loss_mlp": 1.00109291, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.9619221916611473, + "language_loss": 0.76867998, + "learning_rate": 3.95658084522853e-06, + "loss": 0.7916227, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.7412257194519043 + }, + { + "auxiliary_loss_clip": 0.01121659, + "auxiliary_loss_mlp": 0.01173416, + "balance_loss_clip": 1.00198114, + "balance_loss_mlp": 1.00137269, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.6196849993115465, + "language_loss": 0.79338837, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81633908, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 5.529139518737793 + }, + { + "auxiliary_loss_clip": 0.01137087, + "auxiliary_loss_mlp": 0.01173389, + "balance_loss_clip": 1.00216746, + "balance_loss_mlp": 1.0012511, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 2.063698325522414, + "language_loss": 0.87698364, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90008843, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 2.6583526134490967 + }, + { + "auxiliary_loss_clip": 0.01152567, + "auxiliary_loss_mlp": 0.01173784, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.00155067, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.2693068911045864, + "language_loss": 0.82020831, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84347183, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.654019355773926 + }, + { + "auxiliary_loss_clip": 0.01155701, + "auxiliary_loss_mlp": 0.01173439, + "balance_loss_clip": 1.00234771, + "balance_loss_mlp": 1.00149107, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 1.932795978688171, + "language_loss": 0.81060344, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83389485, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 2.5898633003234863 + }, + { + "auxiliary_loss_clip": 0.0118572, + "auxiliary_loss_mlp": 0.01173632, + "balance_loss_clip": 1.00226879, + "balance_loss_mlp": 1.00158906, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.297785374132077, + "language_loss": 0.86981159, + "learning_rate": 3.956176360347553e-06, + "loss": 0.89340514, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 5.379546165466309 + }, + { + "auxiliary_loss_clip": 0.01153869, + "auxiliary_loss_mlp": 0.0117019, + "balance_loss_clip": 1.00310898, + "balance_loss_mlp": 1.00081718, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9819206180509289, + "language_loss": 0.65855116, + "learning_rate": 3.956095240823862e-06, + "loss": 0.68179172, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.135218620300293 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01173391, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00115716, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 2.249813047975925, + "language_loss": 0.79755139, + "learning_rate": 3.956014047124844e-06, + "loss": 0.82064795, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 2.6180615425109863 + }, + { + "auxiliary_loss_clip": 0.01185728, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_clip": 1.0022167, + "balance_loss_mlp": 1.00145578, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 2.753630076780803, + "language_loss": 0.78250813, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80610043, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.5536887645721436 + }, + { + "auxiliary_loss_clip": 0.01137378, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00141621, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.6324087621115853, + "language_loss": 0.73547983, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75858814, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.6369378566741943 + }, + { + "auxiliary_loss_clip": 0.01151948, + "auxiliary_loss_mlp": 0.01172993, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.00114119, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 1.776296607610414, + "language_loss": 0.77867466, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80192405, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.6822099685668945 + }, + { + "auxiliary_loss_clip": 0.01139447, + "auxiliary_loss_mlp": 0.01172876, + "balance_loss_clip": 1.00233483, + "balance_loss_mlp": 1.00111878, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.529358510545203, + "language_loss": 0.87067819, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89380145, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 2.6059465408325195 + }, + { + "auxiliary_loss_clip": 0.01169467, + "auxiliary_loss_mlp": 0.01173572, + "balance_loss_clip": 1.00222254, + "balance_loss_mlp": 1.00143337, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 2.3322152816338493, + "language_loss": 0.67238522, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69581568, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.514756441116333 + }, + { + "auxiliary_loss_clip": 0.01169111, + "auxiliary_loss_mlp": 0.01173524, + "balance_loss_clip": 1.00225675, + "balance_loss_mlp": 1.00129032, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 5.524934609099181, + "language_loss": 0.70576298, + "learning_rate": 3.95552532742147e-06, + "loss": 0.72918928, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.577658176422119 + }, + { + "auxiliary_loss_clip": 0.01120281, + "auxiliary_loss_mlp": 0.01173186, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00123858, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.526104190651695, + "language_loss": 0.81052375, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83345842, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.6974480152130127 + }, + { + "auxiliary_loss_clip": 0.01153523, + "auxiliary_loss_mlp": 0.0117354, + "balance_loss_clip": 1.00217509, + "balance_loss_mlp": 1.00121069, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 4.3675306233260205, + "language_loss": 0.72328126, + "learning_rate": 3.955361827590961e-06, + "loss": 0.74655187, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.615294933319092 + }, + { + "auxiliary_loss_clip": 0.01135854, + "auxiliary_loss_mlp": 0.01169688, + "balance_loss_clip": 1.00248373, + "balance_loss_mlp": 1.00031555, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 5.143722999980717, + "language_loss": 0.55356002, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57661545, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 3.0103657245635986 + }, + { + "auxiliary_loss_clip": 0.01137354, + "auxiliary_loss_mlp": 0.01173369, + "balance_loss_clip": 1.00221682, + "balance_loss_mlp": 1.00132656, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.7885942609182635, + "language_loss": 0.8127085, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83581567, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.6810569763183594 + }, + { + "auxiliary_loss_clip": 0.01123258, + "auxiliary_loss_mlp": 0.01173401, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.00126302, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.6535529607401942, + "language_loss": 0.81509089, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83805752, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.6705188751220703 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.00749497, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00052357, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 2.1626098480128415, + "language_loss": 0.65090752, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66960341, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 2.8549916744232178 + }, + { + "auxiliary_loss_clip": 0.01136123, + "auxiliary_loss_mlp": 0.01173325, + "balance_loss_clip": 1.002038, + "balance_loss_mlp": 1.00137806, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 2.3716808123410242, + "language_loss": 0.83267605, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85577059, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.7105536460876465 + }, + { + "auxiliary_loss_clip": 0.01152724, + "auxiliary_loss_mlp": 0.01173819, + "balance_loss_clip": 1.00223494, + "balance_loss_mlp": 1.00139499, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.581510298750314, + "language_loss": 0.74203342, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76529884, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.557490110397339 + }, + { + "auxiliary_loss_clip": 0.01169504, + "auxiliary_loss_mlp": 0.01173043, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00119042, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.917316001490983, + "language_loss": 0.74288589, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76631135, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.616349697113037 + }, + { + "auxiliary_loss_clip": 0.01169774, + "auxiliary_loss_mlp": 0.01173323, + "balance_loss_clip": 1.00224388, + "balance_loss_mlp": 1.0014708, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.9676333094103506, + "language_loss": 0.69866359, + "learning_rate": 3.954704862616971e-06, + "loss": 0.7220946, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.5530190467834473 + }, + { + "auxiliary_loss_clip": 0.01168598, + "auxiliary_loss_mlp": 0.0117336, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00141227, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 6.807955223330469, + "language_loss": 0.83015609, + "learning_rate": 3.954622408410747e-06, + "loss": 0.85357571, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.5396616458892822 + }, + { + "auxiliary_loss_clip": 0.01153408, + "auxiliary_loss_mlp": 0.01172894, + "balance_loss_clip": 1.00223911, + "balance_loss_mlp": 1.00104165, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.3104834583380267, + "language_loss": 0.84442997, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86769295, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.5909807682037354 + }, + { + "auxiliary_loss_clip": 0.01169747, + "auxiliary_loss_mlp": 0.01173541, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.00130773, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.8926777468127323, + "language_loss": 0.68971783, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71315074, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.7144198417663574 + }, + { + "auxiliary_loss_clip": 0.01169597, + "auxiliary_loss_mlp": 0.00749521, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00053215, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.652879434048323, + "language_loss": 0.74909651, + "learning_rate": 3.954374601087729e-06, + "loss": 0.76828766, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.6205461025238037 + }, + { + "auxiliary_loss_clip": 0.01168991, + "auxiliary_loss_mlp": 0.01173499, + "balance_loss_clip": 1.0022037, + "balance_loss_mlp": 1.00126529, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.9443422745156889, + "language_loss": 0.69217074, + "learning_rate": 3.954291850422382e-06, + "loss": 0.71559566, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 2.6381375789642334 + }, + { + "auxiliary_loss_clip": 0.01136864, + "auxiliary_loss_mlp": 0.01173323, + "balance_loss_clip": 1.00202644, + "balance_loss_mlp": 1.00137556, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.1658501348607464, + "language_loss": 0.83674514, + "learning_rate": 3.954209025650093e-06, + "loss": 0.85984707, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 2.6212430000305176 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01173242, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.0013895, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 2.2839328516654795, + "language_loss": 0.8054449, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82870448, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.5462448596954346 + }, + { + "auxiliary_loss_clip": 0.01169814, + "auxiliary_loss_mlp": 0.01173479, + "balance_loss_clip": 1.00218129, + "balance_loss_mlp": 1.0012449, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 4.818051020611921, + "language_loss": 0.83075029, + "learning_rate": 3.954043153797251e-06, + "loss": 0.8541832, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.532409429550171 + }, + { + "auxiliary_loss_clip": 0.01136883, + "auxiliary_loss_mlp": 0.01173009, + "balance_loss_clip": 1.00207162, + "balance_loss_mlp": 1.00115728, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 3.185201024284468, + "language_loss": 0.6276269, + "learning_rate": 3.953960106722989e-06, + "loss": 0.65072584, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.637962579727173 + }, + { + "auxiliary_loss_clip": 0.0118566, + "auxiliary_loss_mlp": 0.01173299, + "balance_loss_clip": 1.00230169, + "balance_loss_mlp": 1.0011611, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.2285012089129754, + "language_loss": 0.71228826, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73587793, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.540252208709717 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01173416, + "balance_loss_clip": 1.00224447, + "balance_loss_mlp": 1.00156367, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 11.990373188856926, + "language_loss": 0.79878664, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82221347, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.623466730117798 + }, + { + "auxiliary_loss_clip": 0.0115245, + "auxiliary_loss_mlp": 0.01172902, + "balance_loss_clip": 1.00189459, + "balance_loss_mlp": 1.00104952, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 2.424073055396197, + "language_loss": 0.74104881, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76430231, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.6001927852630615 + }, + { + "auxiliary_loss_clip": 0.01169144, + "auxiliary_loss_mlp": 0.01173091, + "balance_loss_clip": 1.00202179, + "balance_loss_mlp": 1.00133443, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.8017113531175966, + "language_loss": 0.75570321, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77912557, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 2.5300562381744385 + }, + { + "auxiliary_loss_clip": 0.01136929, + "auxiliary_loss_mlp": 0.01173242, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00148511, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 1.9225755855439597, + "language_loss": 0.86841708, + "learning_rate": 3.953543759999312e-06, + "loss": 0.89151877, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.580786943435669 + }, + { + "auxiliary_loss_clip": 0.01103636, + "auxiliary_loss_mlp": 0.01173777, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00144804, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.454199650236614, + "language_loss": 0.71354604, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73632014, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 4.275251626968384 + }, + { + "auxiliary_loss_clip": 0.01135344, + "auxiliary_loss_mlp": 0.01173674, + "balance_loss_clip": 1.0019356, + "balance_loss_mlp": 1.00163126, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.1688476691159395, + "language_loss": 0.84941745, + "learning_rate": 3.953376702737693e-06, + "loss": 0.87250763, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 4.071742534637451 + }, + { + "auxiliary_loss_clip": 0.01152442, + "auxiliary_loss_mlp": 0.01173108, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00135136, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.9850160978545612, + "language_loss": 0.6742782, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69753373, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.5816502571105957 + }, + { + "auxiliary_loss_clip": 0.01120957, + "auxiliary_loss_mlp": 0.01173321, + "balance_loss_clip": 1.0019778, + "balance_loss_mlp": 1.00137329, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.9852583487731643, + "language_loss": 0.81230819, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83525091, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 2.6563467979431152 + }, + { + "auxiliary_loss_clip": 0.01169805, + "auxiliary_loss_mlp": 0.01173738, + "balance_loss_clip": 1.00235891, + "balance_loss_mlp": 1.00159979, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.5279934498165937, + "language_loss": 0.80936921, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83280468, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 3.9064691066741943 + }, + { + "auxiliary_loss_clip": 0.01138931, + "auxiliary_loss_mlp": 0.01173026, + "balance_loss_clip": 1.00215054, + "balance_loss_mlp": 1.00117397, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 1.9630841134388428, + "language_loss": 0.84478211, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86790168, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 4.040278911590576 + }, + { + "auxiliary_loss_clip": 0.01168683, + "auxiliary_loss_mlp": 0.00749598, + "balance_loss_clip": 1.00295842, + "balance_loss_mlp": 1.00082958, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7007401623669293, + "language_loss": 0.54681331, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56599617, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.088515520095825 + }, + { + "auxiliary_loss_clip": 0.01120408, + "auxiliary_loss_mlp": 0.01169431, + "balance_loss_clip": 1.00288951, + "balance_loss_mlp": 1.00005817, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7542136383879982, + "language_loss": 0.58185124, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60474956, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.367457866668701 + }, + { + "auxiliary_loss_clip": 0.01153964, + "auxiliary_loss_mlp": 0.01173183, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.00142622, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.9235294941685515, + "language_loss": 0.6930542, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71632564, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.6295907497406006 + }, + { + "auxiliary_loss_clip": 0.01136738, + "auxiliary_loss_mlp": 0.01173093, + "balance_loss_clip": 1.00201356, + "balance_loss_mlp": 1.00124097, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.9824307945567834, + "language_loss": 0.80435348, + "learning_rate": 3.952705511055698e-06, + "loss": 0.82745183, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.6838767528533936 + }, + { + "auxiliary_loss_clip": 0.0115299, + "auxiliary_loss_mlp": 0.0117284, + "balance_loss_clip": 1.0020951, + "balance_loss_mlp": 1.00127435, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.7584237619486442, + "language_loss": 0.93178588, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95504415, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.6836509704589844 + }, + { + "auxiliary_loss_clip": 0.01169513, + "auxiliary_loss_mlp": 0.01172779, + "balance_loss_clip": 1.00228596, + "balance_loss_mlp": 1.00140357, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 2.252192197083057, + "language_loss": 0.88811183, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.91153479, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.622692823410034 + }, + { + "auxiliary_loss_clip": 0.01153533, + "auxiliary_loss_mlp": 0.0117373, + "balance_loss_clip": 1.00227904, + "balance_loss_mlp": 1.00159192, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 2.065280068228765, + "language_loss": 0.77038109, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79365373, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.598914623260498 + }, + { + "auxiliary_loss_clip": 0.01136852, + "auxiliary_loss_mlp": 0.01173024, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00136209, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 3.4801782381643673, + "language_loss": 0.77814162, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80124038, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.5867843627929688 + }, + { + "auxiliary_loss_clip": 0.01153162, + "auxiliary_loss_mlp": 0.01172995, + "balance_loss_clip": 1.00221872, + "balance_loss_mlp": 1.00114322, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 2.148927672202063, + "language_loss": 0.85641456, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.8796761, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 2.6424472332000732 + }, + { + "auxiliary_loss_clip": 0.01169275, + "auxiliary_loss_mlp": 0.01173098, + "balance_loss_clip": 1.0022589, + "balance_loss_mlp": 1.00143719, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 3.068751794472714, + "language_loss": 0.80253202, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82595575, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.519294023513794 + }, + { + "auxiliary_loss_clip": 0.01169496, + "auxiliary_loss_mlp": 0.01172671, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00110543, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.469326622519831, + "language_loss": 0.86233801, + "learning_rate": 3.952114330822364e-06, + "loss": 0.88575971, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.5066678524017334 + }, + { + "auxiliary_loss_clip": 0.01169637, + "auxiliary_loss_mlp": 0.01172959, + "balance_loss_clip": 1.00227356, + "balance_loss_mlp": 1.00129747, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.260586046053524, + "language_loss": 0.85331285, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87673879, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.5424532890319824 + }, + { + "auxiliary_loss_clip": 0.01169716, + "auxiliary_loss_mlp": 0.00749433, + "balance_loss_clip": 1.00219321, + "balance_loss_mlp": 1.00040579, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 2.2795286528794967, + "language_loss": 0.83547717, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85466868, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.5791945457458496 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01173093, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00143123, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 2.337650714474068, + "language_loss": 0.84420335, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86762297, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.53342604637146 + }, + { + "auxiliary_loss_clip": 0.01169246, + "auxiliary_loss_mlp": 0.01173014, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00125766, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.5746458868505468, + "language_loss": 0.75608838, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77951097, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.592991828918457 + }, + { + "auxiliary_loss_clip": 0.01119283, + "auxiliary_loss_mlp": 0.0117287, + "balance_loss_clip": 1.00206959, + "balance_loss_mlp": 1.00139952, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.7123730004848277, + "language_loss": 0.78160298, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80452454, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.703742265701294 + }, + { + "auxiliary_loss_clip": 0.01152845, + "auxiliary_loss_mlp": 0.01173226, + "balance_loss_clip": 1.00222421, + "balance_loss_mlp": 1.0014689, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 2.070104885606911, + "language_loss": 0.86406922, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88732988, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.55196213722229 + }, + { + "auxiliary_loss_clip": 0.01153322, + "auxiliary_loss_mlp": 0.01173264, + "balance_loss_clip": 1.00215602, + "balance_loss_mlp": 1.00150704, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.1371339868742556, + "language_loss": 0.83197749, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85524333, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.6030964851379395 + }, + { + "auxiliary_loss_clip": 0.01136043, + "auxiliary_loss_mlp": 0.01173187, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00152552, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.6339933881107715, + "language_loss": 0.78547132, + "learning_rate": 3.951434254872751e-06, + "loss": 0.80856359, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.620500326156616 + }, + { + "auxiliary_loss_clip": 0.0116934, + "auxiliary_loss_mlp": 0.01173012, + "balance_loss_clip": 1.00217307, + "balance_loss_mlp": 1.00144577, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.273189432739832, + "language_loss": 0.7317965, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75522006, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.5343942642211914 + }, + { + "auxiliary_loss_clip": 0.01152361, + "auxiliary_loss_mlp": 0.01172915, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00144482, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 4.32350997656586, + "language_loss": 0.72744, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75069284, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.58774733543396 + }, + { + "auxiliary_loss_clip": 0.01152885, + "auxiliary_loss_mlp": 0.01173001, + "balance_loss_clip": 1.00222838, + "balance_loss_mlp": 1.00143492, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.9621066534982907, + "language_loss": 0.78148818, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80474699, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.6489484310150146 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01172891, + "balance_loss_clip": 1.0024699, + "balance_loss_mlp": 1.00122976, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.0233566555839815, + "language_loss": 0.70120406, + "learning_rate": 3.951092440828715e-06, + "loss": 0.72448933, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.5820977687835693 + }, + { + "auxiliary_loss_clip": 0.01185516, + "auxiliary_loss_mlp": 0.01173001, + "balance_loss_clip": 1.00230932, + "balance_loss_mlp": 1.00162578, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 4.5474644457013715, + "language_loss": 0.77514577, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79873097, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.4985992908477783 + }, + { + "auxiliary_loss_clip": 0.01135478, + "auxiliary_loss_mlp": 0.01172674, + "balance_loss_clip": 1.00206804, + "balance_loss_mlp": 1.00120354, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.3996850818351396, + "language_loss": 0.72768563, + "learning_rate": 3.950921089880003e-06, + "loss": 0.75076711, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.6799464225769043 + }, + { + "auxiliary_loss_clip": 0.01168408, + "auxiliary_loss_mlp": 0.01172608, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00104165, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 2.107110707315841, + "language_loss": 0.89084458, + "learning_rate": 3.950835303435337e-06, + "loss": 0.91425472, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.5291075706481934 + }, + { + "auxiliary_loss_clip": 0.01172064, + "auxiliary_loss_mlp": 0.01172591, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00092936, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.9981206231168651, + "language_loss": 0.80984032, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83328694, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.5373690128326416 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01172801, + "balance_loss_clip": 1.00207579, + "balance_loss_mlp": 1.00113916, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 4.048110918595666, + "language_loss": 0.86287916, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88629425, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.4805941581726074 + }, + { + "auxiliary_loss_clip": 0.01135902, + "auxiliary_loss_mlp": 0.0117295, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00128901, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7021262230186542, + "language_loss": 0.80814487, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83123338, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.665945529937744 + }, + { + "auxiliary_loss_clip": 0.01172058, + "auxiliary_loss_mlp": 0.01172994, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.00142813, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 2.0156527304810012, + "language_loss": 0.82465398, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84810448, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 2.507927179336548 + }, + { + "auxiliary_loss_clip": 0.01169209, + "auxiliary_loss_mlp": 0.00749301, + "balance_loss_clip": 1.00223804, + "balance_loss_mlp": 1.00032949, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 2.736826278389978, + "language_loss": 0.68288398, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70206904, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.5345406532287598 + }, + { + "auxiliary_loss_clip": 0.01152738, + "auxiliary_loss_mlp": 0.01168929, + "balance_loss_clip": 1.00322294, + "balance_loss_mlp": 1.00031948, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.8519616698923792, + "language_loss": 0.60804152, + "learning_rate": 3.950319031388119e-06, + "loss": 0.63125825, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.0985941886901855 + }, + { + "auxiliary_loss_clip": 0.01139014, + "auxiliary_loss_mlp": 0.01172754, + "balance_loss_clip": 1.0020628, + "balance_loss_mlp": 1.00128317, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.026786405837622, + "language_loss": 0.73316193, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75627959, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 5.566009044647217 + }, + { + "auxiliary_loss_clip": 0.01136998, + "auxiliary_loss_mlp": 0.01172834, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00145912, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.9240821400746115, + "language_loss": 0.84526825, + "learning_rate": 3.950146349020525e-06, + "loss": 0.8683666, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.629934310913086 + }, + { + "auxiliary_loss_clip": 0.01169595, + "auxiliary_loss_mlp": 0.01169869, + "balance_loss_clip": 1.00349426, + "balance_loss_mlp": 1.00049615, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7457296068672967, + "language_loss": 0.55662429, + "learning_rate": 3.950059896910473e-06, + "loss": 0.58001888, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 3.04141902923584 + }, + { + "auxiliary_loss_clip": 0.01168671, + "auxiliary_loss_mlp": 0.01172085, + "balance_loss_clip": 1.00210631, + "balance_loss_mlp": 1.00080466, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.594680193343813, + "language_loss": 0.89942896, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92283654, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 4.09164834022522 + }, + { + "auxiliary_loss_clip": 0.01119827, + "auxiliary_loss_mlp": 0.00749435, + "balance_loss_clip": 1.00344658, + "balance_loss_mlp": 1.00050378, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.7987281181890734, + "language_loss": 0.63698435, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65567696, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.3871564865112305 + }, + { + "auxiliary_loss_clip": 0.01168558, + "auxiliary_loss_mlp": 0.01172294, + "balance_loss_clip": 1.0021373, + "balance_loss_mlp": 1.00120521, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 2.2044656239851377, + "language_loss": 0.88165212, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90506059, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 3.9462833404541016 + }, + { + "auxiliary_loss_clip": 0.01151926, + "auxiliary_loss_mlp": 0.01172258, + "balance_loss_clip": 1.00206363, + "balance_loss_mlp": 1.00116849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 2.1973172855926872, + "language_loss": 0.82174611, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84498787, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.5769219398498535 + }, + { + "auxiliary_loss_clip": 0.01168767, + "auxiliary_loss_mlp": 0.00749338, + "balance_loss_clip": 1.00220656, + "balance_loss_mlp": 1.00042391, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.645617367908247, + "language_loss": 0.79581916, + "learning_rate": 3.949626527228875e-06, + "loss": 0.8150003, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.5900096893310547 + }, + { + "auxiliary_loss_clip": 0.01185177, + "auxiliary_loss_mlp": 0.0117219, + "balance_loss_clip": 1.00236964, + "balance_loss_mlp": 1.00129199, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.8872573609563201, + "language_loss": 0.81154704, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83512074, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 2.503328323364258 + }, + { + "auxiliary_loss_clip": 0.01185147, + "auxiliary_loss_mlp": 0.01172316, + "balance_loss_clip": 1.00214243, + "balance_loss_mlp": 1.00113177, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9274178625645366, + "language_loss": 0.80795181, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.8315264, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.5366945266723633 + }, + { + "auxiliary_loss_clip": 0.01168368, + "auxiliary_loss_mlp": 0.01172636, + "balance_loss_clip": 1.00221062, + "balance_loss_mlp": 1.00154686, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.8365726025249491, + "language_loss": 0.88867038, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91208047, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.527834177017212 + }, + { + "auxiliary_loss_clip": 0.0115194, + "auxiliary_loss_mlp": 0.01172768, + "balance_loss_clip": 1.00213981, + "balance_loss_mlp": 1.0013926, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 8.677373851560473, + "language_loss": 0.84935528, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.87260234, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.5600035190582275 + }, + { + "auxiliary_loss_clip": 0.01185599, + "auxiliary_loss_mlp": 0.0116916, + "balance_loss_clip": 1.00344038, + "balance_loss_mlp": 1.00055063, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9055086124463781, + "language_loss": 0.607301, + "learning_rate": 3.949191309296585e-06, + "loss": 0.63084865, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.110874652862549 + }, + { + "auxiliary_loss_clip": 0.01152739, + "auxiliary_loss_mlp": 0.01172106, + "balance_loss_clip": 1.00209665, + "balance_loss_mlp": 1.00120711, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8344409761874747, + "language_loss": 0.85298485, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87623322, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.593879461288452 + }, + { + "auxiliary_loss_clip": 0.01152524, + "auxiliary_loss_mlp": 0.01172516, + "balance_loss_clip": 1.00233698, + "balance_loss_mlp": 1.00142717, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.7347315651977766, + "language_loss": 0.79854441, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82179481, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.549015522003174 + }, + { + "auxiliary_loss_clip": 0.01153095, + "auxiliary_loss_mlp": 0.01172351, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00126159, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 2.6675167275362277, + "language_loss": 0.83622068, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85947508, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.636162281036377 + }, + { + "auxiliary_loss_clip": 0.01153443, + "auxiliary_loss_mlp": 0.01172116, + "balance_loss_clip": 1.00208402, + "balance_loss_mlp": 1.00131309, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.4494853229710785, + "language_loss": 0.89738512, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.92064071, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.550891876220703 + }, + { + "auxiliary_loss_clip": 0.01169062, + "auxiliary_loss_mlp": 0.01172477, + "balance_loss_clip": 1.00227094, + "balance_loss_mlp": 1.00110173, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.9418601282048717, + "language_loss": 0.70210433, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72551978, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.592547655105591 + }, + { + "auxiliary_loss_clip": 0.01135613, + "auxiliary_loss_mlp": 0.01172366, + "balance_loss_clip": 1.00191224, + "balance_loss_mlp": 1.001086, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 3.0873271055461973, + "language_loss": 0.78872067, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81180048, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.5750300884246826 + }, + { + "auxiliary_loss_clip": 0.0116883, + "auxiliary_loss_mlp": 0.01172415, + "balance_loss_clip": 1.00238681, + "balance_loss_mlp": 1.00142133, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.857105281700678, + "language_loss": 0.70202124, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72543371, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.595931053161621 + }, + { + "auxiliary_loss_clip": 0.01089728, + "auxiliary_loss_mlp": 0.01173118, + "balance_loss_clip": 1.00210452, + "balance_loss_mlp": 1.00174308, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8129461585404905, + "language_loss": 0.78969586, + "learning_rate": 3.948491117273956e-06, + "loss": 0.81232429, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.703942060470581 + }, + { + "auxiliary_loss_clip": 0.01152532, + "auxiliary_loss_mlp": 0.0117216, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.00126147, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.5881989705638277, + "language_loss": 0.77079153, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79403847, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.6352505683898926 + }, + { + "auxiliary_loss_clip": 0.01185207, + "auxiliary_loss_mlp": 0.01172471, + "balance_loss_clip": 1.00233245, + "balance_loss_mlp": 1.00138164, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 2.1358732457461147, + "language_loss": 0.78113639, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80471313, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.5530319213867188 + }, + { + "auxiliary_loss_clip": 0.01185302, + "auxiliary_loss_mlp": 0.0117289, + "balance_loss_clip": 1.00232184, + "balance_loss_mlp": 1.00170517, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.373859832764612, + "language_loss": 0.85282099, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87640297, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.5438082218170166 + }, + { + "auxiliary_loss_clip": 0.01184985, + "auxiliary_loss_mlp": 0.01172093, + "balance_loss_clip": 1.00220668, + "balance_loss_mlp": 1.00129032, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 2.133174595620066, + "language_loss": 0.76836395, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79193473, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.5539956092834473 + }, + { + "auxiliary_loss_clip": 0.01169335, + "auxiliary_loss_mlp": 0.01168545, + "balance_loss_clip": 1.00323343, + "balance_loss_mlp": 0.99993497, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.774097451185223, + "language_loss": 0.60762638, + "learning_rate": 3.948051095825149e-06, + "loss": 0.63100517, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.123142957687378 + }, + { + "auxiliary_loss_clip": 0.01135673, + "auxiliary_loss_mlp": 0.01172551, + "balance_loss_clip": 1.00195205, + "balance_loss_mlp": 1.00136638, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 4.174143350865454, + "language_loss": 0.77272534, + "learning_rate": 3.947962869911147e-06, + "loss": 0.7958076, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.6192078590393066 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01172347, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.00135326, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.311458190195904, + "language_loss": 0.73282152, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75590569, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.620448112487793 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.00749355, + "balance_loss_clip": 1.00209844, + "balance_loss_mlp": 1.00042844, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 1.945236382657882, + "language_loss": 0.79701489, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81619322, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.581166982650757 + }, + { + "auxiliary_loss_clip": 0.01185306, + "auxiliary_loss_mlp": 0.01172702, + "balance_loss_clip": 1.00235748, + "balance_loss_mlp": 1.00161266, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 3.7478344015771348, + "language_loss": 0.8148725, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83845258, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.5154755115509033 + }, + { + "auxiliary_loss_clip": 0.01168697, + "auxiliary_loss_mlp": 0.01172779, + "balance_loss_clip": 1.00227642, + "balance_loss_mlp": 1.00140405, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.6609548092917508, + "language_loss": 0.85974014, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88315487, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.529193639755249 + }, + { + "auxiliary_loss_clip": 0.01152569, + "auxiliary_loss_mlp": 0.01172293, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00139499, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 2.0728418983409664, + "language_loss": 0.86140335, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88465196, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.537785053253174 + }, + { + "auxiliary_loss_clip": 0.01155471, + "auxiliary_loss_mlp": 0.01172368, + "balance_loss_clip": 1.00233817, + "balance_loss_mlp": 1.00146973, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.0031468714863347, + "language_loss": 0.90110248, + "learning_rate": 3.947431963338532e-06, + "loss": 0.9243809, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.5978779792785645 + }, + { + "auxiliary_loss_clip": 0.0118518, + "auxiliary_loss_mlp": 0.01168487, + "balance_loss_clip": 1.00315475, + "balance_loss_mlp": 0.99987698, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7777846871796301, + "language_loss": 0.52960646, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55314314, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.1542160511016846 + }, + { + "auxiliary_loss_clip": 0.01185178, + "auxiliary_loss_mlp": 0.00749306, + "balance_loss_clip": 1.002316, + "balance_loss_mlp": 1.00037086, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.8791484627001855, + "language_loss": 0.76704717, + "learning_rate": 3.947254403670641e-06, + "loss": 0.78639197, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.481689453125 + }, + { + "auxiliary_loss_clip": 0.011527, + "auxiliary_loss_mlp": 0.01172322, + "balance_loss_clip": 1.00227904, + "balance_loss_mlp": 1.00132823, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.209946665049864, + "language_loss": 0.9402678, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96351802, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.5303843021392822 + }, + { + "auxiliary_loss_clip": 0.01168873, + "auxiliary_loss_mlp": 0.01172392, + "balance_loss_clip": 1.00219572, + "balance_loss_mlp": 1.0013032, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 1.984858306063072, + "language_loss": 0.87803179, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90144444, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.52231764793396 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01172236, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00114632, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 2.167095431338149, + "language_loss": 0.74683261, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76992512, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 4.072818279266357 + }, + { + "auxiliary_loss_clip": 0.01152456, + "auxiliary_loss_mlp": 0.01168706, + "balance_loss_clip": 1.00298595, + "balance_loss_mlp": 1.00009656, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7610656865199241, + "language_loss": 0.61066496, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63387656, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.2097599506378174 + }, + { + "auxiliary_loss_clip": 0.01152127, + "auxiliary_loss_mlp": 0.01172518, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00142848, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.9197272943799253, + "language_loss": 0.61875069, + "learning_rate": 3.946809212358516e-06, + "loss": 0.64199716, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 4.066200256347656 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.01172246, + "balance_loss_clip": 1.00219584, + "balance_loss_mlp": 1.00144291, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1197823915221328, + "language_loss": 0.81242281, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83550763, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 2.6833443641662598 + }, + { + "auxiliary_loss_clip": 0.01172116, + "auxiliary_loss_mlp": 0.01172432, + "balance_loss_clip": 1.00253773, + "balance_loss_mlp": 1.0013423, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.744711410453946, + "language_loss": 0.72052163, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74396712, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 4.000543594360352 + }, + { + "auxiliary_loss_clip": 0.01152609, + "auxiliary_loss_mlp": 0.01172708, + "balance_loss_clip": 1.00223732, + "balance_loss_mlp": 1.00171375, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 3.504135938254009, + "language_loss": 0.8705014, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8937546, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 3.987273693084717 + }, + { + "auxiliary_loss_clip": 0.01169223, + "auxiliary_loss_mlp": 0.01172344, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00125515, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 2.866529871850615, + "language_loss": 0.88331068, + "learning_rate": 3.946451730470993e-06, + "loss": 0.9067263, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.6543123722076416 + }, + { + "auxiliary_loss_clip": 0.01151877, + "auxiliary_loss_mlp": 0.01172575, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.0013907, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 2.0750336537231284, + "language_loss": 0.83554482, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85878932, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.6033754348754883 + }, + { + "auxiliary_loss_clip": 0.01152646, + "auxiliary_loss_mlp": 0.01172183, + "balance_loss_clip": 1.00224566, + "balance_loss_mlp": 1.00137949, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.9865752128509016, + "language_loss": 0.66863614, + "learning_rate": 3.946272546655801e-06, + "loss": 0.6918844, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 2.737262010574341 + }, + { + "auxiliary_loss_clip": 0.01152697, + "auxiliary_loss_mlp": 0.01172809, + "balance_loss_clip": 1.00209868, + "balance_loss_mlp": 1.00181508, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.7062732936113183, + "language_loss": 0.76191068, + "learning_rate": 3.94618284404223e-06, + "loss": 0.78516573, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.597907543182373 + }, + { + "auxiliary_loss_clip": 0.01137206, + "auxiliary_loss_mlp": 0.01172306, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00131249, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 2.8902493814994985, + "language_loss": 0.87326717, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.8963623, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 2.642019033432007 + }, + { + "auxiliary_loss_clip": 0.01119605, + "auxiliary_loss_mlp": 0.01172215, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00112545, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 2.550304827675248, + "language_loss": 0.79685771, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81977594, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 2.6966240406036377 + }, + { + "auxiliary_loss_clip": 0.01118893, + "auxiliary_loss_mlp": 0.01172285, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00148201, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7401458025919725, + "language_loss": 0.86612034, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88903213, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.6848702430725098 + }, + { + "auxiliary_loss_clip": 0.01169295, + "auxiliary_loss_mlp": 0.01172198, + "balance_loss_clip": 1.00228262, + "balance_loss_mlp": 1.00139511, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 2.339892657506994, + "language_loss": 0.82139146, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84480637, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.581472873687744 + }, + { + "auxiliary_loss_clip": 0.01185187, + "auxiliary_loss_mlp": 0.01172333, + "balance_loss_clip": 1.00238872, + "balance_loss_mlp": 1.0011487, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.986690465365258, + "language_loss": 0.8086319, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83220708, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.509605884552002 + }, + { + "auxiliary_loss_clip": 0.01139534, + "auxiliary_loss_mlp": 0.01172025, + "balance_loss_clip": 1.00216281, + "balance_loss_mlp": 1.00093591, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 2.2300605093985486, + "language_loss": 0.75991189, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78302747, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.6328859329223633 + }, + { + "auxiliary_loss_clip": 0.01152035, + "auxiliary_loss_mlp": 0.01171873, + "balance_loss_clip": 1.00214291, + "balance_loss_mlp": 1.00116491, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.8084630413612577, + "language_loss": 0.80431002, + "learning_rate": 3.945552859553516e-06, + "loss": 0.8275491, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.572674512863159 + }, + { + "auxiliary_loss_clip": 0.01168423, + "auxiliary_loss_mlp": 0.01172369, + "balance_loss_clip": 1.00217617, + "balance_loss_mlp": 1.00137472, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.9436833343236717, + "language_loss": 0.76716334, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79057121, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.596534013748169 + }, + { + "auxiliary_loss_clip": 0.01168994, + "auxiliary_loss_mlp": 0.01172467, + "balance_loss_clip": 1.00230336, + "balance_loss_mlp": 1.00128222, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 1.9516798109823843, + "language_loss": 0.78127956, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80469424, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.5936617851257324 + }, + { + "auxiliary_loss_clip": 0.01152596, + "auxiliary_loss_mlp": 0.01171917, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.0012095, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.2374977606515722, + "language_loss": 0.94603777, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96928287, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.5763022899627686 + }, + { + "auxiliary_loss_clip": 0.01120985, + "auxiliary_loss_mlp": 0.01167881, + "balance_loss_clip": 1.00278413, + "balance_loss_mlp": 1.00003386, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8776348239634248, + "language_loss": 0.55041116, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57329983, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.164802312850952 + }, + { + "auxiliary_loss_clip": 0.01185081, + "auxiliary_loss_mlp": 0.01172179, + "balance_loss_clip": 1.00234842, + "balance_loss_mlp": 1.0012809, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.4191759037747946, + "language_loss": 0.84082288, + "learning_rate": 3.945100657298039e-06, + "loss": 0.8643955, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.494305372238159 + }, + { + "auxiliary_loss_clip": 0.0115224, + "auxiliary_loss_mlp": 0.01168491, + "balance_loss_clip": 1.0038693, + "balance_loss_mlp": 1.00064421, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7620704651140562, + "language_loss": 0.60441804, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62762535, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.2108154296875 + }, + { + "auxiliary_loss_clip": 0.01137119, + "auxiliary_loss_mlp": 0.01172033, + "balance_loss_clip": 1.00216317, + "balance_loss_mlp": 1.00103903, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.4121438991077615, + "language_loss": 0.86346751, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88655901, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.601393222808838 + }, + { + "auxiliary_loss_clip": 0.01168303, + "auxiliary_loss_mlp": 0.01171925, + "balance_loss_clip": 1.00223863, + "balance_loss_mlp": 1.00102639, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 2.0478795361540314, + "language_loss": 0.73037004, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75377238, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.587984800338745 + }, + { + "auxiliary_loss_clip": 0.01151715, + "auxiliary_loss_mlp": 0.00749297, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.00040317, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.7349109068454598, + "language_loss": 0.91118509, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93019521, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.6015448570251465 + }, + { + "auxiliary_loss_clip": 0.01136834, + "auxiliary_loss_mlp": 0.01171727, + "balance_loss_clip": 1.00230777, + "balance_loss_mlp": 1.00120997, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 1.9845203069146242, + "language_loss": 0.88566363, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90874922, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.6491634845733643 + }, + { + "auxiliary_loss_clip": 0.01168907, + "auxiliary_loss_mlp": 0.01171852, + "balance_loss_clip": 1.00224042, + "balance_loss_mlp": 1.00143075, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.7951743603287347, + "language_loss": 0.79475224, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81815982, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.5746006965637207 + }, + { + "auxiliary_loss_clip": 0.01135873, + "auxiliary_loss_mlp": 0.01171905, + "balance_loss_clip": 1.00205994, + "balance_loss_mlp": 1.00129294, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 1.9468021547260348, + "language_loss": 0.7395885, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76266634, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.655435562133789 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.0117201, + "balance_loss_clip": 1.00243258, + "balance_loss_mlp": 1.0013026, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.866964952492922, + "language_loss": 0.87130368, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8943845, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.7989935874938965 + }, + { + "auxiliary_loss_clip": 0.01168241, + "auxiliary_loss_mlp": 0.01172062, + "balance_loss_clip": 1.00220394, + "balance_loss_mlp": 1.00125909, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.316988665809039, + "language_loss": 0.72600114, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74940413, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.5744521617889404 + }, + { + "auxiliary_loss_clip": 0.01168412, + "auxiliary_loss_mlp": 0.01171958, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00153661, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 1.9750656389831405, + "language_loss": 0.91028714, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93369085, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.5978872776031494 + }, + { + "auxiliary_loss_clip": 0.0116863, + "auxiliary_loss_mlp": 0.01171523, + "balance_loss_clip": 1.00224578, + "balance_loss_mlp": 1.00100589, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 3.1441309373689865, + "language_loss": 0.75638908, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77979064, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.7023935317993164 + }, + { + "auxiliary_loss_clip": 0.01155181, + "auxiliary_loss_mlp": 0.01172481, + "balance_loss_clip": 1.00219989, + "balance_loss_mlp": 1.0015831, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 1.9727353685535953, + "language_loss": 0.85215056, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87542725, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.6510472297668457 + }, + { + "auxiliary_loss_clip": 0.01136867, + "auxiliary_loss_mlp": 0.01172292, + "balance_loss_clip": 1.00237691, + "balance_loss_mlp": 1.00187039, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 3.560046795152687, + "language_loss": 0.82934082, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85243237, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.6416776180267334 + }, + { + "auxiliary_loss_clip": 0.01168333, + "auxiliary_loss_mlp": 0.01171838, + "balance_loss_clip": 1.00227284, + "balance_loss_mlp": 1.00132108, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 2.1405830320771226, + "language_loss": 0.72877979, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75218147, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.666734457015991 + }, + { + "auxiliary_loss_clip": 0.01168276, + "auxiliary_loss_mlp": 0.01171621, + "balance_loss_clip": 1.00207031, + "balance_loss_mlp": 1.00119925, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 1.6749543915786522, + "language_loss": 0.92719007, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.950589, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 2.4988749027252197 + }, + { + "auxiliary_loss_clip": 0.01135642, + "auxiliary_loss_mlp": 0.01171712, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00109971, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.9704579169791712, + "language_loss": 0.79709649, + "learning_rate": 3.943641220792039e-06, + "loss": 0.82017004, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 2.606321334838867 + }, + { + "auxiliary_loss_clip": 0.01119123, + "auxiliary_loss_mlp": 0.0117242, + "balance_loss_clip": 1.00200498, + "balance_loss_mlp": 1.00152206, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.154315473022528, + "language_loss": 0.813164, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.83607942, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 4.120486259460449 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.01166933, + "balance_loss_clip": 1.00283194, + "balance_loss_mlp": 0.99984896, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9257055669660211, + "language_loss": 0.67109621, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69428337, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 4.4037346839904785 + }, + { + "auxiliary_loss_clip": 0.01168301, + "auxiliary_loss_mlp": 0.011719, + "balance_loss_clip": 1.00219679, + "balance_loss_mlp": 1.00128746, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 3.007095630659396, + "language_loss": 0.77611434, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.79951644, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.5205647945404053 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01172285, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00167274, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 2.007888126156277, + "language_loss": 0.7492801, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77236378, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 2.8565964698791504 + }, + { + "auxiliary_loss_clip": 0.0113584, + "auxiliary_loss_mlp": 0.01171776, + "balance_loss_clip": 1.0021894, + "balance_loss_mlp": 1.00125957, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 1.9889382969415175, + "language_loss": 0.75104892, + "learning_rate": 3.943181276805054e-06, + "loss": 0.7741251, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 4.0500547885894775 + }, + { + "auxiliary_loss_clip": 0.01153386, + "auxiliary_loss_mlp": 0.01172054, + "balance_loss_clip": 1.00223017, + "balance_loss_mlp": 1.00134671, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.9790121127569362, + "language_loss": 0.73583472, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75908911, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 4.138055086135864 + }, + { + "auxiliary_loss_clip": 0.01169138, + "auxiliary_loss_mlp": 0.01171531, + "balance_loss_clip": 1.00222945, + "balance_loss_mlp": 1.00120425, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 10.353015592086706, + "language_loss": 0.84894478, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87235147, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.5415332317352295 + }, + { + "auxiliary_loss_clip": 0.01152219, + "auxiliary_loss_mlp": 0.01171287, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.00105643, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.2019577741937537, + "language_loss": 0.70845222, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73168725, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.610733985900879 + }, + { + "auxiliary_loss_clip": 0.01169098, + "auxiliary_loss_mlp": 0.01171701, + "balance_loss_clip": 1.00225806, + "balance_loss_mlp": 1.00127912, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5915117910619703, + "language_loss": 0.816755, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.84016293, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 2.4983386993408203 + }, + { + "auxiliary_loss_clip": 0.01070848, + "auxiliary_loss_mlp": 0.01171246, + "balance_loss_clip": 1.00184786, + "balance_loss_mlp": 1.00101495, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.8688476309060074, + "language_loss": 0.75752795, + "learning_rate": 3.942719490677489e-06, + "loss": 0.77994889, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 2.9447247982025146 + }, + { + "auxiliary_loss_clip": 0.01119335, + "auxiliary_loss_mlp": 0.01171507, + "balance_loss_clip": 1.00204337, + "balance_loss_mlp": 1.00137126, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8075368553477933, + "language_loss": 0.83110338, + "learning_rate": 3.9426269124336e-06, + "loss": 0.85401183, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 2.9017562866210938 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01171932, + "balance_loss_clip": 1.00232744, + "balance_loss_mlp": 1.00131965, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.0469196085267147, + "language_loss": 0.83460569, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85767907, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.604727268218994 + }, + { + "auxiliary_loss_clip": 0.01151885, + "auxiliary_loss_mlp": 0.01171754, + "balance_loss_clip": 1.00219584, + "balance_loss_mlp": 1.00123656, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.177524175671939, + "language_loss": 0.76630902, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78954542, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.599034309387207 + }, + { + "auxiliary_loss_clip": 0.01135454, + "auxiliary_loss_mlp": 0.01171216, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00117564, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 2.0466739936702414, + "language_loss": 0.75059146, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77365816, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.6427571773529053 + }, + { + "auxiliary_loss_clip": 0.01168255, + "auxiliary_loss_mlp": 0.01171021, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00088596, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9008863386218144, + "language_loss": 0.78434491, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80773771, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.582700729370117 + }, + { + "auxiliary_loss_clip": 0.01167853, + "auxiliary_loss_mlp": 0.01171118, + "balance_loss_clip": 1.00219023, + "balance_loss_mlp": 1.00126815, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8519422735366284, + "language_loss": 0.70760632, + "learning_rate": 3.942162916315356e-06, + "loss": 0.73099595, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.5740296840667725 + }, + { + "auxiliary_loss_clip": 0.01136314, + "auxiliary_loss_mlp": 0.01171265, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00103378, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.2642551550822576, + "language_loss": 0.81550634, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83858216, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.669074058532715 + }, + { + "auxiliary_loss_clip": 0.01184791, + "auxiliary_loss_mlp": 0.01171361, + "balance_loss_clip": 1.00221109, + "balance_loss_mlp": 1.00122511, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 2.002783648798928, + "language_loss": 0.75000942, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.7735709, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.5141947269439697 + }, + { + "auxiliary_loss_clip": 0.01138512, + "auxiliary_loss_mlp": 0.01171192, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00105667, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 3.1188339147260145, + "language_loss": 0.7719084, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79500544, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.6206040382385254 + }, + { + "auxiliary_loss_clip": 0.01151938, + "auxiliary_loss_mlp": 0.01171898, + "balance_loss_clip": 1.00235033, + "balance_loss_mlp": 1.00157225, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.2696068946030485, + "language_loss": 0.85732508, + "learning_rate": 3.941790393753467e-06, + "loss": 0.8805635, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.597486972808838 + }, + { + "auxiliary_loss_clip": 0.01151438, + "auxiliary_loss_mlp": 0.01171493, + "balance_loss_clip": 1.00212801, + "balance_loss_mlp": 1.00088108, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 2.3521041987937368, + "language_loss": 0.75849479, + "learning_rate": 3.941697079021942e-06, + "loss": 0.7817241, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 2.5748653411865234 + }, + { + "auxiliary_loss_clip": 0.01119147, + "auxiliary_loss_mlp": 0.01171322, + "balance_loss_clip": 1.00219965, + "balance_loss_mlp": 1.00128198, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 2.077307825398657, + "language_loss": 0.87456405, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89746881, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.655977964401245 + }, + { + "auxiliary_loss_clip": 0.01135125, + "auxiliary_loss_mlp": 0.01171151, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.00091982, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.015361052386389, + "language_loss": 0.75434452, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77740729, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.6020450592041016 + }, + { + "auxiliary_loss_clip": 0.01167904, + "auxiliary_loss_mlp": 0.01171165, + "balance_loss_clip": 1.00227475, + "balance_loss_mlp": 1.00102973, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 3.9587368713037243, + "language_loss": 0.79865313, + "learning_rate": 3.941416693065451e-06, + "loss": 0.82204378, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.6731977462768555 + }, + { + "auxiliary_loss_clip": 0.01184771, + "auxiliary_loss_mlp": 0.01171347, + "balance_loss_clip": 1.00225782, + "balance_loss_mlp": 1.00149739, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.422363650868996, + "language_loss": 0.83296883, + "learning_rate": 3.941323083837794e-06, + "loss": 0.85653001, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.5449347496032715 + }, + { + "auxiliary_loss_clip": 0.01152272, + "auxiliary_loss_mlp": 0.01171678, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00154269, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.547534430208533, + "language_loss": 0.70186359, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72510302, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.7330217361450195 + }, + { + "auxiliary_loss_clip": 0.01135658, + "auxiliary_loss_mlp": 0.01171699, + "balance_loss_clip": 1.00216866, + "balance_loss_mlp": 1.00137234, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.2959665054372422, + "language_loss": 0.84701419, + "learning_rate": 3.941135644540535e-06, + "loss": 0.87008774, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.6893322467803955 + }, + { + "auxiliary_loss_clip": 0.01184632, + "auxiliary_loss_mlp": 0.01171174, + "balance_loss_clip": 1.00215399, + "balance_loss_mlp": 1.00113392, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.9456757639415758, + "language_loss": 0.71646941, + "learning_rate": 3.941041814478041e-06, + "loss": 0.74002743, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.52713942527771 + }, + { + "auxiliary_loss_clip": 0.01152654, + "auxiliary_loss_mlp": 0.01171153, + "balance_loss_clip": 1.00221312, + "balance_loss_mlp": 1.00120878, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.0715230531578634, + "language_loss": 0.82092935, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84416741, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.569204807281494 + }, + { + "auxiliary_loss_clip": 0.0113901, + "auxiliary_loss_mlp": 0.01171278, + "balance_loss_clip": 1.00230753, + "balance_loss_mlp": 1.00133324, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.455701638781582, + "language_loss": 0.9270547, + "learning_rate": 3.940853933543114e-06, + "loss": 0.95015758, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.648946762084961 + }, + { + "auxiliary_loss_clip": 0.01168098, + "auxiliary_loss_mlp": 0.01171103, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00115788, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 6.2807442330851435, + "language_loss": 0.79512548, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81851751, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.5378921031951904 + }, + { + "auxiliary_loss_clip": 0.01118902, + "auxiliary_loss_mlp": 0.01170966, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00102174, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 1.866087725571391, + "language_loss": 0.76293337, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78583205, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.713716745376587 + }, + { + "auxiliary_loss_clip": 0.01134978, + "auxiliary_loss_mlp": 0.01171631, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.00111377, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 1.8548808516197934, + "language_loss": 0.84092426, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86399031, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.6279876232147217 + }, + { + "auxiliary_loss_clip": 0.01119289, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00091791, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 5.257976787665296, + "language_loss": 0.69103211, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71393836, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.646285057067871 + }, + { + "auxiliary_loss_clip": 0.01168941, + "auxiliary_loss_mlp": 0.01171539, + "balance_loss_clip": 1.00234818, + "balance_loss_mlp": 1.00140309, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.5517798950432335, + "language_loss": 0.76695001, + "learning_rate": 3.940382943314182e-06, + "loss": 0.79035479, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.6075849533081055 + }, + { + "auxiliary_loss_clip": 0.0118467, + "auxiliary_loss_mlp": 0.01171352, + "balance_loss_clip": 1.00222397, + "balance_loss_mlp": 1.00140762, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.8182593238470754, + "language_loss": 0.80198097, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82554126, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.5035312175750732 + }, + { + "auxiliary_loss_clip": 0.01136747, + "auxiliary_loss_mlp": 0.01171199, + "balance_loss_clip": 1.00223434, + "balance_loss_mlp": 1.00106335, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 2.235587117400073, + "language_loss": 0.79153782, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81461734, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 2.92559552192688 + }, + { + "auxiliary_loss_clip": 0.01152106, + "auxiliary_loss_mlp": 0.01171038, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.00109375, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 1.90248966901661, + "language_loss": 0.91739523, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94062674, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 4.034830808639526 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01170808, + "balance_loss_clip": 1.00199926, + "balance_loss_mlp": 1.00095904, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.4050675188611135, + "language_loss": 0.77650154, + "learning_rate": 3.940004826678365e-06, + "loss": 0.7997303, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 4.029199838638306 + }, + { + "auxiliary_loss_clip": 0.01168902, + "auxiliary_loss_mlp": 0.01171261, + "balance_loss_clip": 1.00233078, + "balance_loss_mlp": 1.00122046, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.372605746709174, + "language_loss": 0.89700967, + "learning_rate": 3.939910113597498e-06, + "loss": 0.92041129, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.5864129066467285 + }, + { + "auxiliary_loss_clip": 0.01103407, + "auxiliary_loss_mlp": 0.00749266, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00033379, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.3581458686000887, + "language_loss": 0.78119022, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.79971695, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.7719290256500244 + }, + { + "auxiliary_loss_clip": 0.01152476, + "auxiliary_loss_mlp": 0.01167443, + "balance_loss_clip": 1.00347662, + "balance_loss_mlp": 1.00035977, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.7588661214925183, + "language_loss": 0.60563433, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62883353, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 4.638633728027344 + }, + { + "auxiliary_loss_clip": 0.01151134, + "auxiliary_loss_mlp": 0.0117092, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00116563, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.6946981209463627, + "language_loss": 0.80077112, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82399166, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 4.082391738891602 + }, + { + "auxiliary_loss_clip": 0.01137059, + "auxiliary_loss_mlp": 0.01170976, + "balance_loss_clip": 1.00224984, + "balance_loss_mlp": 1.00112641, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7654361452179732, + "language_loss": 0.7995134, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82259369, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.6285369396209717 + }, + { + "auxiliary_loss_clip": 0.01167957, + "auxiliary_loss_mlp": 0.0117075, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.00118721, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.7300295429924357, + "language_loss": 0.77002925, + "learning_rate": 3.939435444841306e-06, + "loss": 0.79341632, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.566901445388794 + }, + { + "auxiliary_loss_clip": 0.01184765, + "auxiliary_loss_mlp": 0.01171529, + "balance_loss_clip": 1.00230145, + "balance_loss_mlp": 1.00158465, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 10.8445931154091, + "language_loss": 0.774409, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79797202, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.5632128715515137 + }, + { + "auxiliary_loss_clip": 0.01072834, + "auxiliary_loss_mlp": 0.01167252, + "balance_loss_clip": 1.00427437, + "balance_loss_mlp": 1.00016844, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6756375719617378, + "language_loss": 0.57867026, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60107112, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.801330804824829 + }, + { + "auxiliary_loss_clip": 0.01136734, + "auxiliary_loss_mlp": 0.01170905, + "balance_loss_clip": 1.0022819, + "balance_loss_mlp": 1.00096011, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 7.223001879875494, + "language_loss": 0.86630791, + "learning_rate": 3.939149761035749e-06, + "loss": 0.88938433, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 4.03853964805603 + }, + { + "auxiliary_loss_clip": 0.01122788, + "auxiliary_loss_mlp": 0.00749194, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00026035, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 2.835655524100067, + "language_loss": 0.62138212, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64010191, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 2.820059061050415 + }, + { + "auxiliary_loss_clip": 0.01169062, + "auxiliary_loss_mlp": 0.01167127, + "balance_loss_clip": 1.0032897, + "balance_loss_mlp": 1.0000428, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8791438015196229, + "language_loss": 0.57020652, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59356838, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.067826271057129 + }, + { + "auxiliary_loss_clip": 0.01134955, + "auxiliary_loss_mlp": 0.01171365, + "balance_loss_clip": 1.00212157, + "balance_loss_mlp": 1.00170636, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 3.1937046678902736, + "language_loss": 0.88862854, + "learning_rate": 3.938863415435429e-06, + "loss": 0.91169178, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 2.6704423427581787 + }, + { + "auxiliary_loss_clip": 0.01184624, + "auxiliary_loss_mlp": 0.01171225, + "balance_loss_clip": 1.00225019, + "balance_loss_mlp": 1.00118446, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.949485857285458, + "language_loss": 0.76139367, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78495216, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.4811348915100098 + }, + { + "auxiliary_loss_clip": 0.01136701, + "auxiliary_loss_mlp": 0.01171275, + "balance_loss_clip": 1.00250411, + "balance_loss_mlp": 1.00123489, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1739819356964842, + "language_loss": 0.83230913, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85538882, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.692490577697754 + }, + { + "auxiliary_loss_clip": 0.01151334, + "auxiliary_loss_mlp": 0.00749232, + "balance_loss_clip": 1.00221443, + "balance_loss_mlp": 1.00025713, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 3.298094341224004, + "language_loss": 0.76851904, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.7875247, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.5702462196350098 + }, + { + "auxiliary_loss_clip": 0.01184648, + "auxiliary_loss_mlp": 0.01166426, + "balance_loss_clip": 1.00325263, + "balance_loss_mlp": 1.00010574, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8224808931443657, + "language_loss": 0.57448304, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59799373, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.110971212387085 + }, + { + "auxiliary_loss_clip": 0.01152736, + "auxiliary_loss_mlp": 0.0117114, + "balance_loss_clip": 1.00230789, + "balance_loss_mlp": 1.00138581, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6687109879119717, + "language_loss": 0.83689356, + "learning_rate": 3.938384702378727e-06, + "loss": 0.86013228, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.6422550678253174 + }, + { + "auxiliary_loss_clip": 0.01119726, + "auxiliary_loss_mlp": 0.007492, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00030649, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.9310160309137692, + "language_loss": 0.87632626, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501554, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.701327323913574 + }, + { + "auxiliary_loss_clip": 0.01103368, + "auxiliary_loss_mlp": 0.00749223, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00026882, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 3.476143828421171, + "language_loss": 0.84396851, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86249441, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.6863627433776855 + }, + { + "auxiliary_loss_clip": 0.01152267, + "auxiliary_loss_mlp": 0.00749251, + "balance_loss_clip": 1.00233972, + "balance_loss_mlp": 1.00032961, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 1.9263993093835603, + "language_loss": 0.67074215, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.68975729, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.5841925144195557 + }, + { + "auxiliary_loss_clip": 0.01168046, + "auxiliary_loss_mlp": 0.0117081, + "balance_loss_clip": 1.00235093, + "balance_loss_mlp": 1.00105572, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.540507075622147, + "language_loss": 0.91565859, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93904716, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.525865316390991 + }, + { + "auxiliary_loss_clip": 0.01117914, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.00125957, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.389085619153735, + "language_loss": 0.79675162, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81963801, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.641360282897949 + }, + { + "auxiliary_loss_clip": 0.0115505, + "auxiliary_loss_mlp": 0.01171257, + "balance_loss_clip": 1.00232291, + "balance_loss_mlp": 1.00131214, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.176727418840421, + "language_loss": 0.79183543, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81509852, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.590956211090088 + }, + { + "auxiliary_loss_clip": 0.01151697, + "auxiliary_loss_mlp": 0.01171118, + "balance_loss_clip": 1.00220835, + "balance_loss_mlp": 1.00117362, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 2.5480854611577026, + "language_loss": 0.86266077, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88588893, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.586616277694702 + }, + { + "auxiliary_loss_clip": 0.0115154, + "auxiliary_loss_mlp": 0.01171708, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00138235, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.5922233443081835, + "language_loss": 1.00961685, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03284943, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.5751090049743652 + }, + { + "auxiliary_loss_clip": 0.01168027, + "auxiliary_loss_mlp": 0.01171251, + "balance_loss_clip": 1.00229454, + "balance_loss_mlp": 1.00149703, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.9208969666781834, + "language_loss": 0.84859645, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87198925, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.5744125843048096 + }, + { + "auxiliary_loss_clip": 0.01184632, + "auxiliary_loss_mlp": 0.01171302, + "balance_loss_clip": 1.00231814, + "balance_loss_mlp": 1.00116634, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 2.142669893599407, + "language_loss": 0.78371775, + "learning_rate": 3.937421763940642e-06, + "loss": 0.80727708, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.576009511947632 + }, + { + "auxiliary_loss_clip": 0.01167994, + "auxiliary_loss_mlp": 0.01171153, + "balance_loss_clip": 1.00222385, + "balance_loss_mlp": 1.0011127, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 3.766962239285926, + "language_loss": 0.83091593, + "learning_rate": 3.937325065966719e-06, + "loss": 0.85430741, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.5427043437957764 + }, + { + "auxiliary_loss_clip": 0.01184648, + "auxiliary_loss_mlp": 0.01171238, + "balance_loss_clip": 1.00239921, + "balance_loss_mlp": 1.00138855, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.00858322234086, + "language_loss": 0.78141773, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80497664, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.5525717735290527 + }, + { + "auxiliary_loss_clip": 0.01184802, + "auxiliary_loss_mlp": 0.01171411, + "balance_loss_clip": 1.00250673, + "balance_loss_mlp": 1.00108469, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 3.329364301321428, + "language_loss": 0.74992347, + "learning_rate": 3.937131449631859e-06, + "loss": 0.77348554, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.5529322624206543 + }, + { + "auxiliary_loss_clip": 0.01168792, + "auxiliary_loss_mlp": 0.0074927, + "balance_loss_clip": 1.00242233, + "balance_loss_mlp": 1.00035381, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 3.5047734590888315, + "language_loss": 0.78613645, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80531704, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.6171715259552 + }, + { + "auxiliary_loss_clip": 0.01135362, + "auxiliary_loss_mlp": 0.01171476, + "balance_loss_clip": 1.00216198, + "balance_loss_mlp": 1.00172162, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 2.0732677045779004, + "language_loss": 0.71008289, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73315126, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.7183237075805664 + }, + { + "auxiliary_loss_clip": 0.01134602, + "auxiliary_loss_mlp": 0.01170743, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00079787, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 4.4910246258295885, + "language_loss": 0.76494467, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78799808, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.6662657260894775 + }, + { + "auxiliary_loss_clip": 0.01119407, + "auxiliary_loss_mlp": 0.01170865, + "balance_loss_clip": 1.00224364, + "balance_loss_mlp": 1.00111055, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.6010144670185051, + "language_loss": 0.85145509, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87435782, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.7377641201019287 + }, + { + "auxiliary_loss_clip": 0.01103126, + "auxiliary_loss_mlp": 0.01171759, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00143313, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.8848178764367363, + "language_loss": 0.74838942, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77113825, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.733670473098755 + }, + { + "auxiliary_loss_clip": 0.01120152, + "auxiliary_loss_mlp": 0.01171208, + "balance_loss_clip": 1.00216436, + "balance_loss_mlp": 1.00116765, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.8341231185543525, + "language_loss": 0.8156755, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83858913, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 4.096832752227783 + }, + { + "auxiliary_loss_clip": 0.01153142, + "auxiliary_loss_mlp": 0.01172168, + "balance_loss_clip": 1.00249207, + "balance_loss_mlp": 1.00184214, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.1976156640534796, + "language_loss": 0.741476, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76472914, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.583876371383667 + }, + { + "auxiliary_loss_clip": 0.0115262, + "auxiliary_loss_mlp": 0.01170649, + "balance_loss_clip": 1.00219369, + "balance_loss_mlp": 1.00099003, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.1040348638002717, + "language_loss": 0.81897366, + "learning_rate": 3.936354046338046e-06, + "loss": 0.84220636, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 4.100888013839722 + }, + { + "auxiliary_loss_clip": 0.01134827, + "auxiliary_loss_mlp": 0.01171044, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00119472, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.2612747266925215, + "language_loss": 0.8505398, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87359846, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 2.5932412147521973 + }, + { + "auxiliary_loss_clip": 0.01151887, + "auxiliary_loss_mlp": 0.01171053, + "balance_loss_clip": 1.0022186, + "balance_loss_mlp": 1.00139439, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.0256162876443606, + "language_loss": 0.77274227, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79597163, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 2.582326650619507 + }, + { + "auxiliary_loss_clip": 0.01184362, + "auxiliary_loss_mlp": 0.01170856, + "balance_loss_clip": 1.00222707, + "balance_loss_mlp": 1.00110185, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.7792611223308212, + "language_loss": 0.72873372, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75228596, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 3.9432425498962402 + }, + { + "auxiliary_loss_clip": 0.01184557, + "auxiliary_loss_mlp": 0.01171337, + "balance_loss_clip": 1.00233936, + "balance_loss_mlp": 1.00120103, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 3.337792373675008, + "language_loss": 0.66625679, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68981576, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 4.091436386108398 + }, + { + "auxiliary_loss_clip": 0.01168855, + "auxiliary_loss_mlp": 0.01171209, + "balance_loss_clip": 1.0024271, + "balance_loss_mlp": 1.00135934, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.8561884649836813, + "language_loss": 0.81554651, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83894718, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 2.5431149005889893 + }, + { + "auxiliary_loss_clip": 0.01167955, + "auxiliary_loss_mlp": 0.01170906, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.00105619, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.7356913804596747, + "language_loss": 0.90928674, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93267536, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.614423990249634 + }, + { + "auxiliary_loss_clip": 0.01118727, + "auxiliary_loss_mlp": 0.01171432, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00120127, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 3.1325369521190445, + "language_loss": 0.76511157, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78801322, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.7617478370666504 + }, + { + "auxiliary_loss_clip": 0.01152488, + "auxiliary_loss_mlp": 0.01171132, + "balance_loss_clip": 1.00264645, + "balance_loss_mlp": 1.00128293, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 28.725235950039444, + "language_loss": 0.86176741, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88500357, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.6768736839294434 + }, + { + "auxiliary_loss_clip": 0.01168225, + "auxiliary_loss_mlp": 0.0074929, + "balance_loss_clip": 1.0023247, + "balance_loss_mlp": 1.00026786, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 2.1106246444668955, + "language_loss": 0.80935913, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82853425, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.601341962814331 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01171239, + "balance_loss_clip": 1.00217319, + "balance_loss_mlp": 1.00138962, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 2.021269225701869, + "language_loss": 0.78859782, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81166196, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.665153980255127 + }, + { + "auxiliary_loss_clip": 0.01151525, + "auxiliary_loss_mlp": 0.01170974, + "balance_loss_clip": 1.00226521, + "balance_loss_mlp": 1.00131559, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 2.291417993923385, + "language_loss": 0.79104245, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81426746, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.6132991313934326 + }, + { + "auxiliary_loss_clip": 0.01184349, + "auxiliary_loss_mlp": 0.01170916, + "balance_loss_clip": 1.00226474, + "balance_loss_mlp": 1.0012573, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0594167431572346, + "language_loss": 0.85248756, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87604022, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 2.489182710647583 + }, + { + "auxiliary_loss_clip": 0.01137297, + "auxiliary_loss_mlp": 0.01171053, + "balance_loss_clip": 1.00229168, + "balance_loss_mlp": 1.00110817, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.8915791271514188, + "language_loss": 0.63789946, + "learning_rate": 3.935080744080564e-06, + "loss": 0.66098297, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.661299705505371 + }, + { + "auxiliary_loss_clip": 0.01154985, + "auxiliary_loss_mlp": 0.01171053, + "balance_loss_clip": 1.00221944, + "balance_loss_mlp": 1.00110841, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.2485158995971957, + "language_loss": 0.74261028, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76587069, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.725102424621582 + }, + { + "auxiliary_loss_clip": 0.01151974, + "auxiliary_loss_mlp": 0.01170589, + "balance_loss_clip": 1.00214744, + "balance_loss_mlp": 1.00102556, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.5721239045896667, + "language_loss": 0.73079687, + "learning_rate": 3.934883750543966e-06, + "loss": 0.75402248, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.598275661468506 + }, + { + "auxiliary_loss_clip": 0.01153038, + "auxiliary_loss_mlp": 0.01170916, + "balance_loss_clip": 1.00228262, + "balance_loss_mlp": 1.00144792, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 1.739686816861911, + "language_loss": 0.82971907, + "learning_rate": 3.93478514371732e-06, + "loss": 0.85295856, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.6077582836151123 + }, + { + "auxiliary_loss_clip": 0.01134263, + "auxiliary_loss_mlp": 0.01171099, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.00125003, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3203939425434843, + "language_loss": 0.84184372, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86489737, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.6304032802581787 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01170962, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.0011121, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.4692501407511, + "language_loss": 0.71396524, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73702872, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.604145050048828 + }, + { + "auxiliary_loss_clip": 0.01168672, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00150394, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 3.6810784764820808, + "language_loss": 0.73160017, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75499845, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 2.5967204570770264 + }, + { + "auxiliary_loss_clip": 0.01101708, + "auxiliary_loss_mlp": 0.01170958, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00129986, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.6683089563849087, + "language_loss": 0.66926885, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6919955, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.9208104610443115 + }, + { + "auxiliary_loss_clip": 0.01151798, + "auxiliary_loss_mlp": 0.01171046, + "balance_loss_clip": 1.00238776, + "balance_loss_mlp": 1.00129187, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.710928475128763, + "language_loss": 0.73196483, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75519323, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.801795482635498 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.00749167, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00020289, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 2.819851736252333, + "language_loss": 0.74180061, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76081288, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.5935957431793213 + }, + { + "auxiliary_loss_clip": 0.01184429, + "auxiliary_loss_mlp": 0.01170999, + "balance_loss_clip": 1.00238252, + "balance_loss_mlp": 1.00162673, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.4128842618885624, + "language_loss": 0.8317529, + "learning_rate": 3.934092841857642e-06, + "loss": 0.85530716, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.463148593902588 + }, + { + "auxiliary_loss_clip": 0.01151848, + "auxiliary_loss_mlp": 0.01170549, + "balance_loss_clip": 1.00235152, + "balance_loss_mlp": 1.00108123, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.2302102911204744, + "language_loss": 0.76191485, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78513885, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.6214606761932373 + }, + { + "auxiliary_loss_clip": 0.0115221, + "auxiliary_loss_mlp": 0.01170612, + "balance_loss_clip": 1.0022397, + "balance_loss_mlp": 1.00133443, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.7535397772208647, + "language_loss": 0.79759294, + "learning_rate": 3.933894381201034e-06, + "loss": 0.82082117, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.6608102321624756 + }, + { + "auxiliary_loss_clip": 0.01151818, + "auxiliary_loss_mlp": 0.01170383, + "balance_loss_clip": 1.00228238, + "balance_loss_mlp": 1.00110614, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.4694940166475077, + "language_loss": 0.7976349, + "learning_rate": 3.933795040870645e-06, + "loss": 0.82085687, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01170743, + "balance_loss_clip": 1.00215459, + "balance_loss_mlp": 1.00127459, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.7856321266161337, + "language_loss": 0.87968361, + "learning_rate": 3.933695627210554e-06, + "loss": 0.9029063, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.5771994590759277 + }, + { + "auxiliary_loss_clip": 0.01136071, + "auxiliary_loss_mlp": 0.01171087, + "balance_loss_clip": 1.0022049, + "balance_loss_mlp": 1.00142801, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8188999447202376, + "language_loss": 0.76527792, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78834951, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.755735158920288 + }, + { + "auxiliary_loss_clip": 0.01167554, + "auxiliary_loss_mlp": 0.01167249, + "balance_loss_clip": 1.00313008, + "balance_loss_mlp": 1.00092852, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8364222776467178, + "language_loss": 0.54963112, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57297921, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.145660161972046 + }, + { + "auxiliary_loss_clip": 0.01184328, + "auxiliary_loss_mlp": 0.0116732, + "balance_loss_clip": 1.00324297, + "balance_loss_mlp": 1.00099874, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7492060348925742, + "language_loss": 0.55321491, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57673138, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.1033856868743896 + }, + { + "auxiliary_loss_clip": 0.011513, + "auxiliary_loss_mlp": 0.01170908, + "balance_loss_clip": 1.0022943, + "balance_loss_mlp": 1.00115383, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.9798585935855204, + "language_loss": 0.84404719, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86726928, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.6097843647003174 + }, + { + "auxiliary_loss_clip": 0.01117759, + "auxiliary_loss_mlp": 0.01171243, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00177479, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 13.489150264711064, + "language_loss": 0.89160186, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91449189, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.836489677429199 + }, + { + "auxiliary_loss_clip": 0.01150671, + "auxiliary_loss_mlp": 0.01167105, + "balance_loss_clip": 1.00311351, + "balance_loss_mlp": 1.0007838, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6918302780985, + "language_loss": 0.5546391, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57781684, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.149014711380005 + }, + { + "auxiliary_loss_clip": 0.01151137, + "auxiliary_loss_mlp": 0.01170975, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00150681, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.8248193297612914, + "language_loss": 0.91132486, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93454593, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.619755983352661 + }, + { + "auxiliary_loss_clip": 0.01171779, + "auxiliary_loss_mlp": 0.01166653, + "balance_loss_clip": 1.0034306, + "balance_loss_mlp": 1.00033176, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7127678423127491, + "language_loss": 0.59893936, + "learning_rate": 3.932897678513523e-06, + "loss": 0.62232369, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.1007368564605713 + }, + { + "auxiliary_loss_clip": 0.0116856, + "auxiliary_loss_mlp": 0.01170922, + "balance_loss_clip": 1.00223565, + "balance_loss_mlp": 1.00116801, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6965616441627205, + "language_loss": 0.8067258, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83012062, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 3.9856903553009033 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.01170761, + "balance_loss_clip": 1.00221491, + "balance_loss_mlp": 1.00138879, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.4687095852763323, + "language_loss": 0.90857613, + "learning_rate": 3.932697458306779e-06, + "loss": 0.93147701, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 4.140822887420654 + }, + { + "auxiliary_loss_clip": 0.01135633, + "auxiliary_loss_mlp": 0.01170733, + "balance_loss_clip": 1.00246847, + "balance_loss_mlp": 1.00126553, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.1438186964173074, + "language_loss": 0.6386975, + "learning_rate": 3.932597238269386e-06, + "loss": 0.66176116, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.6100881099700928 + }, + { + "auxiliary_loss_clip": 0.01135529, + "auxiliary_loss_mlp": 0.01171149, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00168133, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 2.2024541133547606, + "language_loss": 0.72851205, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75157881, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.7044484615325928 + }, + { + "auxiliary_loss_clip": 0.01168754, + "auxiliary_loss_mlp": 0.01170889, + "balance_loss_clip": 1.00241733, + "balance_loss_mlp": 1.00161135, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.066978744404184, + "language_loss": 0.78702092, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81041729, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 3.89617919921875 + }, + { + "auxiliary_loss_clip": 0.01150831, + "auxiliary_loss_mlp": 0.01170621, + "balance_loss_clip": 1.00222003, + "balance_loss_mlp": 1.00153422, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 2.0034498802674774, + "language_loss": 0.71907699, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74229151, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 3.942697525024414 + }, + { + "auxiliary_loss_clip": 0.011846, + "auxiliary_loss_mlp": 0.00749141, + "balance_loss_clip": 1.00256836, + "balance_loss_mlp": 1.00020778, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.5450472880757777, + "language_loss": 0.78527206, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80460948, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 2.4931139945983887 + }, + { + "auxiliary_loss_clip": 0.01168374, + "auxiliary_loss_mlp": 0.01170206, + "balance_loss_clip": 1.00237167, + "balance_loss_mlp": 1.00102377, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.7960697086159876, + "language_loss": 0.88071394, + "learning_rate": 3.932095038894311e-06, + "loss": 0.9040997, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.5588672161102295 + }, + { + "auxiliary_loss_clip": 0.01135924, + "auxiliary_loss_mlp": 0.01170537, + "balance_loss_clip": 1.0022341, + "balance_loss_mlp": 1.0016408, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 1.7299269716394448, + "language_loss": 0.90366805, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92673266, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.606898069381714 + }, + { + "auxiliary_loss_clip": 0.0115145, + "auxiliary_loss_mlp": 0.01171019, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00155163, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 1.9393961528443617, + "language_loss": 0.86030638, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88353103, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.5716452598571777 + }, + { + "auxiliary_loss_clip": 0.01118447, + "auxiliary_loss_mlp": 0.00749213, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00020754, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.694760015024452, + "language_loss": 0.74496484, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76364148, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.716792345046997 + }, + { + "auxiliary_loss_clip": 0.01184217, + "auxiliary_loss_mlp": 0.011708, + "balance_loss_clip": 1.00232077, + "balance_loss_mlp": 1.00142753, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0439019419531372, + "language_loss": 0.75660205, + "learning_rate": 3.931691960597165e-06, + "loss": 0.7801522, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 2.471189498901367 + }, + { + "auxiliary_loss_clip": 0.01151074, + "auxiliary_loss_mlp": 0.01170613, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.00143075, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.5443058693709466, + "language_loss": 0.76393139, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7871483, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.559767007827759 + }, + { + "auxiliary_loss_clip": 0.01167994, + "auxiliary_loss_mlp": 0.01171261, + "balance_loss_clip": 1.00237036, + "balance_loss_mlp": 1.00150657, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.332317893485439, + "language_loss": 0.86317778, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88657033, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.4959604740142822 + }, + { + "auxiliary_loss_clip": 0.01184379, + "auxiliary_loss_mlp": 0.01170704, + "balance_loss_clip": 1.00237513, + "balance_loss_mlp": 1.00133157, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 4.068473528595831, + "language_loss": 0.7712605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79481131, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.5016872882843018 + }, + { + "auxiliary_loss_clip": 0.01168314, + "auxiliary_loss_mlp": 0.01170446, + "balance_loss_clip": 1.00250983, + "balance_loss_mlp": 1.00135994, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.7129215676930523, + "language_loss": 0.77715528, + "learning_rate": 3.931287710300832e-06, + "loss": 0.80054289, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 2.54099702835083 + }, + { + "auxiliary_loss_clip": 0.01134421, + "auxiliary_loss_mlp": 0.0074915, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00015187, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 6.243875587898013, + "language_loss": 0.71944356, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73827934, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.599500894546509 + }, + { + "auxiliary_loss_clip": 0.01167917, + "auxiliary_loss_mlp": 0.01170861, + "balance_loss_clip": 1.00240827, + "balance_loss_mlp": 1.00139356, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.870289640434342, + "language_loss": 0.81222731, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83561504, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.5510945320129395 + }, + { + "auxiliary_loss_clip": 0.01168216, + "auxiliary_loss_mlp": 0.01170885, + "balance_loss_clip": 1.00255632, + "balance_loss_mlp": 1.00141716, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.524123786764863, + "language_loss": 0.88273579, + "learning_rate": 3.930983753601631e-06, + "loss": 0.9061268, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.625058650970459 + }, + { + "auxiliary_loss_clip": 0.01167704, + "auxiliary_loss_mlp": 0.01170644, + "balance_loss_clip": 1.00232005, + "balance_loss_mlp": 1.00127125, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 1.956016918522906, + "language_loss": 0.72168672, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74507022, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.547560930252075 + }, + { + "auxiliary_loss_clip": 0.01184079, + "auxiliary_loss_mlp": 0.01166629, + "balance_loss_clip": 1.00310671, + "balance_loss_mlp": 1.00030804, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7750480990639411, + "language_loss": 0.5358783, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55938536, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.057154893875122 + }, + { + "auxiliary_loss_clip": 0.01151868, + "auxiliary_loss_mlp": 0.01170866, + "balance_loss_clip": 1.00223899, + "balance_loss_mlp": 1.00139832, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.0382679769478305, + "language_loss": 0.84842646, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.8716538, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.5718255043029785 + }, + { + "auxiliary_loss_clip": 0.01152465, + "auxiliary_loss_mlp": 0.01171012, + "balance_loss_clip": 1.00245833, + "balance_loss_mlp": 1.00173473, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2360047702696244, + "language_loss": 0.81886184, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84209657, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.5732483863830566 + }, + { + "auxiliary_loss_clip": 0.0116749, + "auxiliary_loss_mlp": 0.01170317, + "balance_loss_clip": 1.00219607, + "balance_loss_mlp": 1.00142109, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 3.3656454897519748, + "language_loss": 0.83364975, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85702789, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.561582326889038 + }, + { + "auxiliary_loss_clip": 0.01134989, + "auxiliary_loss_mlp": 0.01170402, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00131595, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.279281200769941, + "language_loss": 0.83230722, + "learning_rate": 3.930373863283608e-06, + "loss": 0.8553611, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.5761520862579346 + }, + { + "auxiliary_loss_clip": 0.0113566, + "auxiliary_loss_mlp": 0.01170364, + "balance_loss_clip": 1.00213456, + "balance_loss_mlp": 1.00118184, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.7029769300988784, + "language_loss": 0.9189899, + "learning_rate": 3.930271958674866e-06, + "loss": 0.9420501, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.6272084712982178 + }, + { + "auxiliary_loss_clip": 0.0116746, + "auxiliary_loss_mlp": 0.01170553, + "balance_loss_clip": 1.00228703, + "balance_loss_mlp": 1.00127602, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.2565690446289017, + "language_loss": 0.81873655, + "learning_rate": 3.930169980870018e-06, + "loss": 0.84211671, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.551555871963501 + }, + { + "auxiliary_loss_clip": 0.01151374, + "auxiliary_loss_mlp": 0.01170677, + "balance_loss_clip": 1.00215971, + "balance_loss_mlp": 1.00168562, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.050588388391105, + "language_loss": 0.7554853, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77870572, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.559832811355591 + }, + { + "auxiliary_loss_clip": 0.01184247, + "auxiliary_loss_mlp": 0.01169959, + "balance_loss_clip": 1.00238264, + "balance_loss_mlp": 1.0012536, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 3.108946958506602, + "language_loss": 0.88826752, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91180956, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.5167925357818604 + }, + { + "auxiliary_loss_clip": 0.01151443, + "auxiliary_loss_mlp": 0.01169953, + "balance_loss_clip": 1.00221884, + "balance_loss_mlp": 1.00124764, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.2936296090995474, + "language_loss": 0.87171805, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89493203, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.5997252464294434 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01170075, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00098813, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.09054308383794, + "language_loss": 0.64486659, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66808379, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.5847721099853516 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01170478, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00148714, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 3.1858350009151857, + "language_loss": 0.74037319, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76309633, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.672532796859741 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01170645, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00117695, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.1142770489723715, + "language_loss": 0.84620291, + "learning_rate": 3.929556577139446e-06, + "loss": 0.868945, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.6900970935821533 + }, + { + "auxiliary_loss_clip": 0.01104531, + "auxiliary_loss_mlp": 0.00749087, + "balance_loss_clip": 1.0021019, + "balance_loss_mlp": 1.00016057, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5647269512459447, + "language_loss": 0.81609666, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83463287, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.7732293605804443 + }, + { + "auxiliary_loss_clip": 0.01184216, + "auxiliary_loss_mlp": 0.01170398, + "balance_loss_clip": 1.00233245, + "balance_loss_mlp": 1.00131118, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.1013622681072226, + "language_loss": 0.86854219, + "learning_rate": 3.929351523836035e-06, + "loss": 0.89208835, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.5562050342559814 + }, + { + "auxiliary_loss_clip": 0.01151607, + "auxiliary_loss_mlp": 0.00749051, + "balance_loss_clip": 1.00233459, + "balance_loss_mlp": 1.00005686, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.3029157158817455, + "language_loss": 0.68363738, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70264399, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.54557728767395 + }, + { + "auxiliary_loss_clip": 0.01135356, + "auxiliary_loss_mlp": 0.01171245, + "balance_loss_clip": 1.00209415, + "balance_loss_mlp": 1.00187242, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.932629178647487, + "language_loss": 0.77615434, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79922032, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.6177210807800293 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01170564, + "balance_loss_clip": 1.0021987, + "balance_loss_mlp": 1.00157249, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 2.0514692286073837, + "language_loss": 0.76583111, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78871924, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 4.08106803894043 + }, + { + "auxiliary_loss_clip": 0.01101065, + "auxiliary_loss_mlp": 0.0117039, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00120795, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.2940450958296474, + "language_loss": 0.81740654, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84012103, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 4.161121606826782 + }, + { + "auxiliary_loss_clip": 0.01184507, + "auxiliary_loss_mlp": 0.01170391, + "balance_loss_clip": 1.00250578, + "balance_loss_mlp": 1.00120926, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7035162413443885, + "language_loss": 0.83273005, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85627908, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.497173547744751 + }, + { + "auxiliary_loss_clip": 0.01134346, + "auxiliary_loss_mlp": 0.01170598, + "balance_loss_clip": 1.00213277, + "balance_loss_mlp": 1.0013212, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 1.8521295811602014, + "language_loss": 0.92394471, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94699413, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 2.6747100353240967 + }, + { + "auxiliary_loss_clip": 0.0113677, + "auxiliary_loss_mlp": 0.01170877, + "balance_loss_clip": 1.00221896, + "balance_loss_mlp": 1.00179029, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5916322079145815, + "language_loss": 0.75686491, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77994144, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 2.6091151237487793 + }, + { + "auxiliary_loss_clip": 0.01167951, + "auxiliary_loss_mlp": 0.01170366, + "balance_loss_clip": 1.00245833, + "balance_loss_mlp": 1.00147009, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.9589059191937033, + "language_loss": 0.71918809, + "learning_rate": 3.928528384485984e-06, + "loss": 0.74257123, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 4.01791787147522 + }, + { + "auxiliary_loss_clip": 0.01151721, + "auxiliary_loss_mlp": 0.01170033, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.00123215, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 1.8404590598549169, + "language_loss": 0.77198648, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79520404, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 4.004232883453369 + }, + { + "auxiliary_loss_clip": 0.01168565, + "auxiliary_loss_mlp": 0.01170749, + "balance_loss_clip": 1.0024941, + "balance_loss_mlp": 1.00156713, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.728981745976354, + "language_loss": 0.87970018, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90309334, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.5456502437591553 + }, + { + "auxiliary_loss_clip": 0.01134503, + "auxiliary_loss_mlp": 0.01170471, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00128889, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.4655065896233674, + "language_loss": 0.81075668, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83380646, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 2.6237688064575195 + }, + { + "auxiliary_loss_clip": 0.01152645, + "auxiliary_loss_mlp": 0.01170827, + "balance_loss_clip": 1.00223541, + "balance_loss_mlp": 1.00164497, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.011694433958673, + "language_loss": 0.70167071, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72490549, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 2.662274122238159 + }, + { + "auxiliary_loss_clip": 0.01155135, + "auxiliary_loss_mlp": 0.01170466, + "balance_loss_clip": 1.00283611, + "balance_loss_mlp": 1.00137973, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 5.837366024915783, + "language_loss": 0.72450304, + "learning_rate": 3.928011545540734e-06, + "loss": 0.7477591, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.6227025985717773 + }, + { + "auxiliary_loss_clip": 0.01135087, + "auxiliary_loss_mlp": 0.00749206, + "balance_loss_clip": 1.00211012, + "balance_loss_mlp": 1.00026476, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.2277364596941607, + "language_loss": 0.74444366, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76328659, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 2.6829493045806885 + }, + { + "auxiliary_loss_clip": 0.01184461, + "auxiliary_loss_mlp": 0.01170625, + "balance_loss_clip": 1.00246, + "balance_loss_mlp": 1.00125277, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.4169633472441494, + "language_loss": 0.79557765, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81912851, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 2.5577402114868164 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01170306, + "balance_loss_clip": 1.00223374, + "balance_loss_mlp": 1.00131464, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 2.269308150338908, + "language_loss": 0.77821803, + "learning_rate": 3.927700564817529e-06, + "loss": 0.80127609, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.01167368, + "auxiliary_loss_mlp": 0.01167122, + "balance_loss_clip": 1.00299215, + "balance_loss_mlp": 1.0015645, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7932495050446176, + "language_loss": 0.55272293, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57606786, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 3.060703754425049 + }, + { + "auxiliary_loss_clip": 0.01086079, + "auxiliary_loss_mlp": 0.01170224, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00132823, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 1.9620387898030034, + "language_loss": 0.90484774, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92741078, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 2.769960641860962 + }, + { + "auxiliary_loss_clip": 0.01120669, + "auxiliary_loss_mlp": 0.0116975, + "balance_loss_clip": 1.0021925, + "balance_loss_mlp": 1.00114048, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 1.820882658432663, + "language_loss": 0.84979355, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87269771, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 2.701941967010498 + }, + { + "auxiliary_loss_clip": 0.01134021, + "auxiliary_loss_mlp": 0.01169946, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00114584, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 4.211636751790319, + "language_loss": 0.76741111, + "learning_rate": 3.927284900491277e-06, + "loss": 0.79045075, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.6484785079956055 + }, + { + "auxiliary_loss_clip": 0.01118929, + "auxiliary_loss_mlp": 0.011706, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00141788, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.3745858242498854, + "language_loss": 0.68518132, + "learning_rate": 3.927180801692764e-06, + "loss": 0.7080766, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 2.8518168926239014 + }, + { + "auxiliary_loss_clip": 0.01184307, + "auxiliary_loss_mlp": 0.0117003, + "balance_loss_clip": 1.00238729, + "balance_loss_mlp": 1.00122952, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 1.7953433111500419, + "language_loss": 0.84044552, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86398888, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.5445916652679443 + }, + { + "auxiliary_loss_clip": 0.01155033, + "auxiliary_loss_mlp": 0.01170242, + "balance_loss_clip": 1.0025816, + "balance_loss_mlp": 1.00153697, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.268696673154298, + "language_loss": 0.64749992, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67075264, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.6116559505462646 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01169873, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00116801, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.0193909547797415, + "language_loss": 0.88630676, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90936148, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.6503844261169434 + }, + { + "auxiliary_loss_clip": 0.01102294, + "auxiliary_loss_mlp": 0.01171025, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.001652, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 4.157615897178576, + "language_loss": 0.72681689, + "learning_rate": 3.926763675749339e-06, + "loss": 0.74954998, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.725301742553711 + }, + { + "auxiliary_loss_clip": 0.0118412, + "auxiliary_loss_mlp": 0.01170071, + "balance_loss_clip": 1.00218463, + "balance_loss_mlp": 1.00127125, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 1.9810256148339036, + "language_loss": 0.7978543, + "learning_rate": 3.92665921159591e-06, + "loss": 0.82139623, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.5028843879699707 + }, + { + "auxiliary_loss_clip": 0.01151412, + "auxiliary_loss_mlp": 0.01170417, + "balance_loss_clip": 1.00238347, + "balance_loss_mlp": 1.00142539, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 3.0214081732581595, + "language_loss": 0.79891151, + "learning_rate": 3.926554674383371e-06, + "loss": 0.82212985, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.655856132507324 + }, + { + "auxiliary_loss_clip": 0.0118431, + "auxiliary_loss_mlp": 0.01167144, + "balance_loss_clip": 1.00343657, + "balance_loss_mlp": 1.00158596, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8022321659213598, + "language_loss": 0.63337463, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65688914, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.1617610454559326 + }, + { + "auxiliary_loss_clip": 0.01150873, + "auxiliary_loss_mlp": 0.01169812, + "balance_loss_clip": 1.00228143, + "balance_loss_mlp": 1.00120211, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6024342572793067, + "language_loss": 0.85292274, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87612957, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.6414268016815186 + }, + { + "auxiliary_loss_clip": 0.01184318, + "auxiliary_loss_mlp": 0.00749079, + "balance_loss_clip": 1.00247741, + "balance_loss_mlp": 1.00021005, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.630773498622483, + "language_loss": 0.79913563, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.81846952, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.541069507598877 + }, + { + "auxiliary_loss_clip": 0.01119369, + "auxiliary_loss_mlp": 0.0117019, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00119925, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.8769002857080022, + "language_loss": 0.73267186, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75556743, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.8398613929748535 + }, + { + "auxiliary_loss_clip": 0.01119168, + "auxiliary_loss_mlp": 0.01166387, + "balance_loss_clip": 1.00289345, + "balance_loss_mlp": 1.00082874, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9291995682493421, + "language_loss": 0.63395375, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65680927, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.3804101943969727 + }, + { + "auxiliary_loss_clip": 0.01102411, + "auxiliary_loss_mlp": 0.01170063, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00145316, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.54190132741197, + "language_loss": 0.78269279, + "learning_rate": 3.925925917089001e-06, + "loss": 0.8054176, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.9865987300872803 + }, + { + "auxiliary_loss_clip": 0.01168369, + "auxiliary_loss_mlp": 0.01170497, + "balance_loss_clip": 1.00235426, + "balance_loss_mlp": 1.00160182, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.0169244182226036, + "language_loss": 0.83524513, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85863376, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.519533157348633 + }, + { + "auxiliary_loss_clip": 0.01168262, + "auxiliary_loss_mlp": 0.01169892, + "balance_loss_clip": 1.00232029, + "balance_loss_mlp": 1.00109124, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.9970701724476438, + "language_loss": 0.77999324, + "learning_rate": 3.925715747031356e-06, + "loss": 0.80337477, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.5763845443725586 + }, + { + "auxiliary_loss_clip": 0.01151767, + "auxiliary_loss_mlp": 0.01169561, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.00104713, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.3947172365257887, + "language_loss": 0.75814521, + "learning_rate": 3.925610552465539e-06, + "loss": 0.78135842, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.6088078022003174 + }, + { + "auxiliary_loss_clip": 0.01152054, + "auxiliary_loss_mlp": 0.01169778, + "balance_loss_clip": 1.002321, + "balance_loss_mlp": 1.00097728, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.2482430344691604, + "language_loss": 0.9237895, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94700778, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.616651773452759 + }, + { + "auxiliary_loss_clip": 0.01155617, + "auxiliary_loss_mlp": 0.01169975, + "balance_loss_clip": 1.00257671, + "balance_loss_mlp": 1.00088835, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.250165621452638, + "language_loss": 0.77023041, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79348636, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.598189115524292 + }, + { + "auxiliary_loss_clip": 0.01184048, + "auxiliary_loss_mlp": 0.01170188, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00129199, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.3908641368790153, + "language_loss": 0.82463419, + "learning_rate": 3.925294530667986e-06, + "loss": 0.8481766, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.5188257694244385 + }, + { + "auxiliary_loss_clip": 0.01135421, + "auxiliary_loss_mlp": 0.01170213, + "balance_loss_clip": 1.00216949, + "balance_loss_mlp": 1.00150788, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 3.3699080098949294, + "language_loss": 0.8494646, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87252092, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 4.059621334075928 + }, + { + "auxiliary_loss_clip": 0.01117932, + "auxiliary_loss_mlp": 0.01166674, + "balance_loss_clip": 1.00296402, + "balance_loss_mlp": 1.00111568, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9326109503526261, + "language_loss": 0.61016989, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63301593, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 2.9930663108825684 + }, + { + "auxiliary_loss_clip": 0.01184241, + "auxiliary_loss_mlp": 0.01169993, + "balance_loss_clip": 1.00251675, + "balance_loss_mlp": 1.00128758, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 1.8271859019376024, + "language_loss": 0.7929945, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8165369, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 3.9146416187286377 + }, + { + "auxiliary_loss_clip": 0.01154869, + "auxiliary_loss_mlp": 0.01170088, + "balance_loss_clip": 1.00263906, + "balance_loss_mlp": 1.00128722, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.6056658468134377, + "language_loss": 0.77098441, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79423404, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.577069044113159 + }, + { + "auxiliary_loss_clip": 0.01151949, + "auxiliary_loss_mlp": 0.0116938, + "balance_loss_clip": 1.00230443, + "balance_loss_mlp": 1.00105619, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 1.7354099613985228, + "language_loss": 0.79257268, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81578594, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.6281397342681885 + }, + { + "auxiliary_loss_clip": 0.01184032, + "auxiliary_loss_mlp": 0.0074912, + "balance_loss_clip": 1.00229764, + "balance_loss_mlp": 1.00013804, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9598606727618288, + "language_loss": 0.7784512, + "learning_rate": 3.924660515982246e-06, + "loss": 0.79778266, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.5089874267578125 + }, + { + "auxiliary_loss_clip": 0.0116728, + "auxiliary_loss_mlp": 0.01169584, + "balance_loss_clip": 1.00218856, + "balance_loss_mlp": 1.00097442, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 2.3774038685729417, + "language_loss": 0.70566952, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72903812, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 5.362022399902344 + }, + { + "auxiliary_loss_clip": 0.01104846, + "auxiliary_loss_mlp": 0.01165607, + "balance_loss_clip": 1.00376415, + "balance_loss_mlp": 1.00004959, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.990220606434683, + "language_loss": 0.61012411, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63282859, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 3.5132172107696533 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01170089, + "balance_loss_clip": 1.00223505, + "balance_loss_mlp": 1.00128829, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 4.301768493723945, + "language_loss": 0.93285203, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95623022, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.741600751876831 + }, + { + "auxiliary_loss_clip": 0.0115129, + "auxiliary_loss_mlp": 0.0117017, + "balance_loss_clip": 1.0022037, + "balance_loss_mlp": 1.00136948, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 2.1089416327030617, + "language_loss": 0.72482967, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74804431, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.6351842880249023 + }, + { + "auxiliary_loss_clip": 0.01134081, + "auxiliary_loss_mlp": 0.01169415, + "balance_loss_clip": 1.00208068, + "balance_loss_mlp": 1.00090122, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.262709622984996, + "language_loss": 0.74523282, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76826781, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 2.6343491077423096 + }, + { + "auxiliary_loss_clip": 0.01134995, + "auxiliary_loss_mlp": 0.011694, + "balance_loss_clip": 1.00209665, + "balance_loss_mlp": 1.00088537, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.20562298603318, + "language_loss": 0.86704558, + "learning_rate": 3.92402387389729e-06, + "loss": 0.89008945, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 2.6028144359588623 + }, + { + "auxiliary_loss_clip": 0.01135294, + "auxiliary_loss_mlp": 0.01169436, + "balance_loss_clip": 1.00212824, + "balance_loss_mlp": 1.00111222, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 3.098421368023188, + "language_loss": 0.86546087, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88850814, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.6315059661865234 + }, + { + "auxiliary_loss_clip": 0.01167444, + "auxiliary_loss_mlp": 0.01169252, + "balance_loss_clip": 1.00234365, + "balance_loss_mlp": 1.00102437, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.451976356117284, + "language_loss": 0.80122793, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82459497, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.5886073112487793 + }, + { + "auxiliary_loss_clip": 0.01167771, + "auxiliary_loss_mlp": 0.01169864, + "balance_loss_clip": 1.0021919, + "balance_loss_mlp": 1.00125408, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.310060292499044, + "language_loss": 0.79027975, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81365615, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 2.554142713546753 + }, + { + "auxiliary_loss_clip": 0.01068658, + "auxiliary_loss_mlp": 0.01169992, + "balance_loss_clip": 1.0016644, + "balance_loss_mlp": 1.00166845, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.9225916933066887, + "language_loss": 0.84554183, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86792833, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 2.837956666946411 + }, + { + "auxiliary_loss_clip": 0.01167381, + "auxiliary_loss_mlp": 0.01169534, + "balance_loss_clip": 1.00223112, + "balance_loss_mlp": 1.00111532, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0554231725042684, + "language_loss": 0.81046736, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.8338365, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 2.5396833419799805 + }, + { + "auxiliary_loss_clip": 0.01150433, + "auxiliary_loss_mlp": 0.01165247, + "balance_loss_clip": 1.00287151, + "balance_loss_mlp": 1.00045168, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.829525183641644, + "language_loss": 0.61111599, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63427281, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 3.219780683517456 + }, + { + "auxiliary_loss_clip": 0.01168261, + "auxiliary_loss_mlp": 0.0116972, + "balance_loss_clip": 1.00231504, + "balance_loss_mlp": 1.00158763, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 1.8876239404450215, + "language_loss": 0.75513887, + "learning_rate": 3.923277805217161e-06, + "loss": 0.7785188, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.5884833335876465 + }, + { + "auxiliary_loss_clip": 0.01103818, + "auxiliary_loss_mlp": 0.00749131, + "balance_loss_clip": 1.00206435, + "balance_loss_mlp": 1.00022614, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.81348992265938, + "language_loss": 0.73113179, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74966127, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.7608814239501953 + }, + { + "auxiliary_loss_clip": 0.01151725, + "auxiliary_loss_mlp": 0.01169524, + "balance_loss_clip": 1.00227237, + "balance_loss_mlp": 1.00100982, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 1.8725108164876387, + "language_loss": 0.87019694, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89340949, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.664588212966919 + }, + { + "auxiliary_loss_clip": 0.01135025, + "auxiliary_loss_mlp": 0.0116948, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.00125146, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.6887221271609762, + "language_loss": 0.77910733, + "learning_rate": 3.922956967452898e-06, + "loss": 0.80215234, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.648016929626465 + }, + { + "auxiliary_loss_clip": 0.01183894, + "auxiliary_loss_mlp": 0.01169502, + "balance_loss_clip": 1.00228739, + "balance_loss_mlp": 1.00146484, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 2.5978682913643962, + "language_loss": 0.77167982, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79521382, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.587463617324829 + }, + { + "auxiliary_loss_clip": 0.011518, + "auxiliary_loss_mlp": 0.01169231, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.00100315, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 2.745380678017978, + "language_loss": 0.72221923, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74542952, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.5856759548187256 + }, + { + "auxiliary_loss_clip": 0.01151485, + "auxiliary_loss_mlp": 0.01169704, + "balance_loss_clip": 1.002231, + "balance_loss_mlp": 1.00128496, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5472388130210382, + "language_loss": 0.82097912, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84419096, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.589315414428711 + }, + { + "auxiliary_loss_clip": 0.01135666, + "auxiliary_loss_mlp": 0.01164746, + "balance_loss_clip": 1.00433779, + "balance_loss_mlp": 0.99995106, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7665606123496214, + "language_loss": 0.61043954, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63344365, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.087282180786133 + }, + { + "auxiliary_loss_clip": 0.01105363, + "auxiliary_loss_mlp": 0.00749095, + "balance_loss_clip": 1.00248694, + "balance_loss_mlp": 1.00021815, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.209741223791848, + "language_loss": 0.8617636, + "learning_rate": 3.922420779525586e-06, + "loss": 0.88030821, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.70721435546875 + }, + { + "auxiliary_loss_clip": 0.01119207, + "auxiliary_loss_mlp": 0.01169738, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00112832, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.4771475354344115, + "language_loss": 0.66603112, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.6889205, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.657608985900879 + }, + { + "auxiliary_loss_clip": 0.01184038, + "auxiliary_loss_mlp": 0.011696, + "balance_loss_clip": 1.00238025, + "balance_loss_mlp": 1.00118065, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 2.158002979122694, + "language_loss": 0.76030868, + "learning_rate": 3.922205794037456e-06, + "loss": 0.78384507, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.4960615634918213 + }, + { + "auxiliary_loss_clip": 0.01183866, + "auxiliary_loss_mlp": 0.0116921, + "balance_loss_clip": 1.00232601, + "balance_loss_mlp": 1.0008862, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.7619104002973367, + "language_loss": 0.84312874, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86665952, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.5326077938079834 + }, + { + "auxiliary_loss_clip": 0.01151231, + "auxiliary_loss_mlp": 0.01169039, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00100183, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.4559302405947108, + "language_loss": 0.76260042, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78580314, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.6493241786956787 + }, + { + "auxiliary_loss_clip": 0.01184136, + "auxiliary_loss_mlp": 0.01169021, + "balance_loss_clip": 1.00240684, + "balance_loss_mlp": 1.00088847, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 4.215342239907354, + "language_loss": 0.79466641, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81819803, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.540949583053589 + }, + { + "auxiliary_loss_clip": 0.01152301, + "auxiliary_loss_mlp": 0.01169163, + "balance_loss_clip": 1.00232017, + "balance_loss_mlp": 1.00112557, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.4395115665119422, + "language_loss": 0.86281317, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88602781, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.610753297805786 + }, + { + "auxiliary_loss_clip": 0.01150547, + "auxiliary_loss_mlp": 0.01168998, + "balance_loss_clip": 1.00221241, + "balance_loss_mlp": 1.00134254, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.7030727303424524, + "language_loss": 0.75868994, + "learning_rate": 3.921667054809449e-06, + "loss": 0.78188539, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.77681565284729 + }, + { + "auxiliary_loss_clip": 0.01150507, + "auxiliary_loss_mlp": 0.00749105, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.00017679, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.213188289702657, + "language_loss": 0.88680559, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90580177, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.559331178665161 + }, + { + "auxiliary_loss_clip": 0.01167118, + "auxiliary_loss_mlp": 0.01168971, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00131512, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.626540896003952, + "language_loss": 0.68087065, + "learning_rate": 3.921451049000975e-06, + "loss": 0.7042315, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.680294990539551 + }, + { + "auxiliary_loss_clip": 0.01150582, + "auxiliary_loss_mlp": 0.01169079, + "balance_loss_clip": 1.00220132, + "balance_loss_mlp": 1.00113726, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.938534377074667, + "language_loss": 0.69877148, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72196805, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.7208971977233887 + }, + { + "auxiliary_loss_clip": 0.01167871, + "auxiliary_loss_mlp": 0.01168916, + "balance_loss_clip": 1.00222325, + "balance_loss_mlp": 1.00106895, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.5825844239288716, + "language_loss": 0.83061582, + "learning_rate": 3.921234751746038e-06, + "loss": 0.85398364, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.6356847286224365 + }, + { + "auxiliary_loss_clip": 0.01152136, + "auxiliary_loss_mlp": 0.01169037, + "balance_loss_clip": 1.00211382, + "balance_loss_mlp": 1.00119042, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.0592642918251065, + "language_loss": 0.76624209, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78945386, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 4.145294666290283 + }, + { + "auxiliary_loss_clip": 0.01135211, + "auxiliary_loss_mlp": 0.01168604, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00113857, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.1567578027101617, + "language_loss": 0.69203401, + "learning_rate": 3.921018163077448e-06, + "loss": 0.71507215, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 4.095735788345337 + }, + { + "auxiliary_loss_clip": 0.01151232, + "auxiliary_loss_mlp": 0.01169687, + "balance_loss_clip": 1.00227952, + "balance_loss_mlp": 1.00174499, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.7750496057504173, + "language_loss": 0.84454697, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86775619, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.5626003742218018 + }, + { + "auxiliary_loss_clip": 0.01151232, + "auxiliary_loss_mlp": 0.0074901, + "balance_loss_clip": 1.00285006, + "balance_loss_mlp": 1.00029886, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8222669669506183, + "language_loss": 0.65180016, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67080259, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.1883015632629395 + }, + { + "auxiliary_loss_clip": 0.01167069, + "auxiliary_loss_mlp": 0.01168943, + "balance_loss_clip": 1.00220788, + "balance_loss_mlp": 1.00119138, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.5347714782086745, + "language_loss": 0.72176456, + "learning_rate": 3.920692733745835e-06, + "loss": 0.74512464, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.6034793853759766 + }, + { + "auxiliary_loss_clip": 0.01167751, + "auxiliary_loss_mlp": 0.01169171, + "balance_loss_clip": 1.00231707, + "balance_loss_mlp": 1.00132418, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.976729717099028, + "language_loss": 0.76718867, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79055786, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 4.018176317214966 + }, + { + "auxiliary_loss_clip": 0.01118786, + "auxiliary_loss_mlp": 0.01169245, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.00139856, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.93877968101589, + "language_loss": 0.76669157, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78957188, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 4.12971043586731 + }, + { + "auxiliary_loss_clip": 0.01135903, + "auxiliary_loss_mlp": 0.01169173, + "balance_loss_clip": 1.0022316, + "balance_loss_mlp": 1.00132656, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 8.069572713426759, + "language_loss": 0.72307944, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74613011, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 2.6348049640655518 + }, + { + "auxiliary_loss_clip": 0.01151345, + "auxiliary_loss_mlp": 0.00749065, + "balance_loss_clip": 1.00223446, + "balance_loss_mlp": 1.00027418, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.3529406426430777, + "language_loss": 0.79717934, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81618345, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.562385082244873 + }, + { + "auxiliary_loss_clip": 0.01105414, + "auxiliary_loss_mlp": 0.01168855, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00100827, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.2712437652294373, + "language_loss": 0.86192721, + "learning_rate": 3.920148894924246e-06, + "loss": 0.88466984, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.6487486362457275 + }, + { + "auxiliary_loss_clip": 0.01167845, + "auxiliary_loss_mlp": 0.00749057, + "balance_loss_clip": 1.00226164, + "balance_loss_mlp": 1.00035739, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.3213264168829144, + "language_loss": 0.77906072, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79822969, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.525461196899414 + }, + { + "auxiliary_loss_clip": 0.01167357, + "auxiliary_loss_mlp": 0.01168603, + "balance_loss_clip": 1.00215125, + "balance_loss_mlp": 1.0012331, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.218969025607208, + "language_loss": 0.80595607, + "learning_rate": 3.91993084968105e-06, + "loss": 0.8293156, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.5727365016937256 + }, + { + "auxiliary_loss_clip": 0.01167216, + "auxiliary_loss_mlp": 0.01168995, + "balance_loss_clip": 1.00225735, + "balance_loss_mlp": 1.00105321, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 2.3474500405331082, + "language_loss": 0.78528929, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80865145, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 2.5330452919006348 + }, + { + "auxiliary_loss_clip": 0.01150045, + "auxiliary_loss_mlp": 0.01168833, + "balance_loss_clip": 1.00210917, + "balance_loss_mlp": 1.00108171, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 1.7743586290499713, + "language_loss": 0.7752673, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79845607, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.5489988327026367 + }, + { + "auxiliary_loss_clip": 0.01150861, + "auxiliary_loss_mlp": 0.01168805, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.0011487, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 1.9082941496098778, + "language_loss": 0.7021845, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72538114, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 2.572286367416382 + }, + { + "auxiliary_loss_clip": 0.01150716, + "auxiliary_loss_mlp": 0.01169039, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00109673, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.6486956298680586, + "language_loss": 0.80914819, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83234578, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 2.532519578933716 + }, + { + "auxiliary_loss_clip": 0.01167705, + "auxiliary_loss_mlp": 0.00749014, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.00021863, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.7991035571052336, + "language_loss": 0.92852592, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94769311, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.580554246902466 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01169249, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00111651, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1214830362254933, + "language_loss": 0.87329191, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89633203, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.613694906234741 + }, + { + "auxiliary_loss_clip": 0.01150432, + "auxiliary_loss_mlp": 0.00749071, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00025129, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 1.9294979673312815, + "language_loss": 0.84230024, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86129522, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.610825538635254 + }, + { + "auxiliary_loss_clip": 0.01135848, + "auxiliary_loss_mlp": 0.0116938, + "balance_loss_clip": 1.0024308, + "balance_loss_mlp": 1.00134254, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 2.1713722654451524, + "language_loss": 0.83050394, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85355622, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.6167104244232178 + }, + { + "auxiliary_loss_clip": 0.01183764, + "auxiliary_loss_mlp": 0.0116886, + "balance_loss_clip": 1.00230026, + "balance_loss_mlp": 1.00101388, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 2.0928010401488897, + "language_loss": 0.74660057, + "learning_rate": 3.918946042768707e-06, + "loss": 0.77012682, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 2.7336299419403076 + }, + { + "auxiliary_loss_clip": 0.01150609, + "auxiliary_loss_mlp": 0.01168904, + "balance_loss_clip": 1.00225925, + "balance_loss_mlp": 1.00115299, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 2.7948697303513685, + "language_loss": 0.73324275, + "learning_rate": 3.918836255889908e-06, + "loss": 0.75643796, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.564020872116089 + }, + { + "auxiliary_loss_clip": 0.01167088, + "auxiliary_loss_mlp": 0.01168798, + "balance_loss_clip": 1.0021987, + "balance_loss_mlp": 1.00123739, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.2647296183266414, + "language_loss": 0.88414836, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90750718, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.5142338275909424 + }, + { + "auxiliary_loss_clip": 0.01151083, + "auxiliary_loss_mlp": 0.01168636, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.00088477, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 1.7286219728606285, + "language_loss": 0.67209256, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69528979, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.601696252822876 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01168815, + "balance_loss_clip": 1.00228262, + "balance_loss_mlp": 1.0009681, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.2461199738006097, + "language_loss": 0.80915558, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83219886, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.70941162109375 + }, + { + "auxiliary_loss_clip": 0.01166669, + "auxiliary_loss_mlp": 0.01165254, + "balance_loss_clip": 1.00278997, + "balance_loss_mlp": 1.0012219, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8001240143993914, + "language_loss": 0.66100377, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68432301, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.1223554611206055 + }, + { + "auxiliary_loss_clip": 0.01151268, + "auxiliary_loss_mlp": 0.01169185, + "balance_loss_clip": 1.00236917, + "balance_loss_mlp": 1.00114715, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 3.477336622127539, + "language_loss": 0.79599899, + "learning_rate": 3.918286230142327e-06, + "loss": 0.8192035, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.6285274028778076 + }, + { + "auxiliary_loss_clip": 0.0113475, + "auxiliary_loss_mlp": 0.00748946, + "balance_loss_clip": 1.00215662, + "balance_loss_mlp": 1.00016892, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 5.146081446064865, + "language_loss": 0.72313333, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74197024, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.649953842163086 + }, + { + "auxiliary_loss_clip": 0.01134512, + "auxiliary_loss_mlp": 0.01168653, + "balance_loss_clip": 1.00235772, + "balance_loss_mlp": 1.00080621, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.087567159652589, + "language_loss": 0.72374451, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74677622, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.6343586444854736 + }, + { + "auxiliary_loss_clip": 0.01138778, + "auxiliary_loss_mlp": 0.0116863, + "balance_loss_clip": 1.00260735, + "balance_loss_mlp": 1.00097394, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 3.0623293802915934, + "language_loss": 0.78233325, + "learning_rate": 3.917955341761128e-06, + "loss": 0.80540735, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.580578565597534 + }, + { + "auxiliary_loss_clip": 0.01116778, + "auxiliary_loss_mlp": 0.01168286, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00082088, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 3.8639220040068896, + "language_loss": 0.75661647, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77946705, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.613121509552002 + }, + { + "auxiliary_loss_clip": 0.01166968, + "auxiliary_loss_mlp": 0.01168525, + "balance_loss_clip": 1.00229323, + "balance_loss_mlp": 1.00096428, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.7297251256902488, + "language_loss": 0.75301033, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77636528, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.6189944744110107 + }, + { + "auxiliary_loss_clip": 0.0118364, + "auxiliary_loss_mlp": 0.01168981, + "balance_loss_clip": 1.00230265, + "balance_loss_mlp": 1.00113392, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.144143444318656, + "language_loss": 0.74249661, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.7660228, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 2.493628978729248 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01168755, + "balance_loss_clip": 1.00235677, + "balance_loss_mlp": 1.00119472, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.670922316141947, + "language_loss": 0.73373085, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75675887, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.5964596271514893 + }, + { + "auxiliary_loss_clip": 0.01133617, + "auxiliary_loss_mlp": 0.011684, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00093484, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.8794255902551458, + "language_loss": 0.98628795, + "learning_rate": 3.917402406600525e-06, + "loss": 1.0093081, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.702655076980591 + }, + { + "auxiliary_loss_clip": 0.01150245, + "auxiliary_loss_mlp": 0.01168666, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00101018, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 2.2599180303254625, + "language_loss": 0.86195636, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88514555, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.6126325130462646 + }, + { + "auxiliary_loss_clip": 0.01150595, + "auxiliary_loss_mlp": 0.01168854, + "balance_loss_clip": 1.00221956, + "balance_loss_mlp": 1.00119805, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 2.1536267919825645, + "language_loss": 0.85312909, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87632358, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.614586114883423 + }, + { + "auxiliary_loss_clip": 0.01152077, + "auxiliary_loss_mlp": 0.01168364, + "balance_loss_clip": 1.00224125, + "balance_loss_mlp": 1.00109005, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 2.3855939025995423, + "language_loss": 0.85400409, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87720847, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 4.052857160568237 + }, + { + "auxiliary_loss_clip": 0.01116836, + "auxiliary_loss_mlp": 0.01168728, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00107169, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 2.9472941933674566, + "language_loss": 0.76962221, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79247785, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.6026298999786377 + }, + { + "auxiliary_loss_clip": 0.01166921, + "auxiliary_loss_mlp": 0.01168359, + "balance_loss_clip": 1.0022167, + "balance_loss_mlp": 1.00089431, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 2.8285944345888674, + "language_loss": 0.83205277, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85540557, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 3.943793535232544 + }, + { + "auxiliary_loss_clip": 0.01154722, + "auxiliary_loss_mlp": 0.01168373, + "balance_loss_clip": 1.00228417, + "balance_loss_mlp": 1.00081277, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 7.597996467388941, + "language_loss": 0.74356979, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76680076, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.5481090545654297 + }, + { + "auxiliary_loss_clip": 0.01150976, + "auxiliary_loss_mlp": 0.01168608, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00133371, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 1.9952873390828934, + "language_loss": 0.72119832, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74439418, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.623115301132202 + }, + { + "auxiliary_loss_clip": 0.0115065, + "auxiliary_loss_mlp": 0.0116885, + "balance_loss_clip": 1.00212252, + "balance_loss_mlp": 1.00100362, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 4.3395368503137135, + "language_loss": 0.71759522, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74079025, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 3.9997687339782715 + }, + { + "auxiliary_loss_clip": 0.01166978, + "auxiliary_loss_mlp": 0.01168742, + "balance_loss_clip": 1.00228071, + "balance_loss_mlp": 1.00127709, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.8152023646865887, + "language_loss": 0.81245291, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83581007, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.5740156173706055 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01168766, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.00101483, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7427685300166726, + "language_loss": 0.76121497, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78425348, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 4.029539585113525 + }, + { + "auxiliary_loss_clip": 0.01151848, + "auxiliary_loss_mlp": 0.01164461, + "balance_loss_clip": 1.00275409, + "balance_loss_mlp": 1.00119233, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8576317753130132, + "language_loss": 0.55249202, + "learning_rate": 3.916179551676238e-06, + "loss": 0.5756551, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 3.1986684799194336 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.0116801, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00102115, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.035081927753151, + "language_loss": 0.78103292, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80404544, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.6014304161071777 + }, + { + "auxiliary_loss_clip": 0.01183556, + "auxiliary_loss_mlp": 0.01168849, + "balance_loss_clip": 1.00232637, + "balance_loss_mlp": 1.00109792, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 2.107266765415168, + "language_loss": 0.79050577, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81402981, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.541191577911377 + }, + { + "auxiliary_loss_clip": 0.01134947, + "auxiliary_loss_mlp": 0.01168436, + "balance_loss_clip": 1.00210762, + "balance_loss_mlp": 1.00106633, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 2.075735881735132, + "language_loss": 0.82908487, + "learning_rate": 3.915844519655208e-06, + "loss": 0.85211873, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.6078133583068848 + }, + { + "auxiliary_loss_clip": 0.01154386, + "auxiliary_loss_mlp": 0.01168306, + "balance_loss_clip": 1.00243247, + "balance_loss_mlp": 1.00103176, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.28734618284391, + "language_loss": 0.88743752, + "learning_rate": 3.915732697011183e-06, + "loss": 0.91066444, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.6320407390594482 + }, + { + "auxiliary_loss_clip": 0.01150347, + "auxiliary_loss_mlp": 0.0116845, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00108004, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 3.222472912544542, + "language_loss": 0.738249, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.761437, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.6036529541015625 + }, + { + "auxiliary_loss_clip": 0.01133611, + "auxiliary_loss_mlp": 0.01168203, + "balance_loss_clip": 1.0019784, + "balance_loss_mlp": 1.0009284, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 2.113487661276458, + "language_loss": 0.87892461, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90194273, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.585489511489868 + }, + { + "auxiliary_loss_clip": 0.01167392, + "auxiliary_loss_mlp": 0.00748974, + "balance_loss_clip": 1.00225556, + "balance_loss_mlp": 1.00011039, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 5.635769447301251, + "language_loss": 0.78450561, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80366927, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 2.5672929286956787 + }, + { + "auxiliary_loss_clip": 0.01167403, + "auxiliary_loss_mlp": 0.00748895, + "balance_loss_clip": 1.00238919, + "balance_loss_mlp": 1.00008488, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.2517318213470685, + "language_loss": 0.73761547, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75677842, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 2.5955288410186768 + }, + { + "auxiliary_loss_clip": 0.01183565, + "auxiliary_loss_mlp": 0.01168425, + "balance_loss_clip": 1.00228429, + "balance_loss_mlp": 1.00124574, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.6898372671551645, + "language_loss": 0.74836504, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77188492, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 2.554375648498535 + }, + { + "auxiliary_loss_clip": 0.01151127, + "auxiliary_loss_mlp": 0.0116811, + "balance_loss_clip": 1.00196719, + "balance_loss_mlp": 1.00102663, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.6782908929424256, + "language_loss": 0.85236293, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87555528, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.621354341506958 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_clip": 1.00215387, + "balance_loss_mlp": 1.00113869, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.132527647044314, + "language_loss": 0.74548286, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76867735, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 2.610083818435669 + }, + { + "auxiliary_loss_clip": 0.01135163, + "auxiliary_loss_mlp": 0.01168694, + "balance_loss_clip": 1.00248075, + "balance_loss_mlp": 1.00094259, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 3.945633018929015, + "language_loss": 0.78184807, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80488664, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.6525533199310303 + }, + { + "auxiliary_loss_clip": 0.01166698, + "auxiliary_loss_mlp": 0.01168319, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.00114024, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 3.4025492487391737, + "language_loss": 0.71913463, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74248475, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.5651912689208984 + }, + { + "auxiliary_loss_clip": 0.01150705, + "auxiliary_loss_mlp": 0.01169114, + "balance_loss_clip": 1.00228977, + "balance_loss_mlp": 1.00117207, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 1.6508417209076067, + "language_loss": 0.779019, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80221719, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.6281516551971436 + }, + { + "auxiliary_loss_clip": 0.01182832, + "auxiliary_loss_mlp": 0.00748775, + "balance_loss_clip": 1.00257349, + "balance_loss_mlp": 0.99994391, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9324926832661818, + "language_loss": 0.58126068, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60057676, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.904430389404297 + }, + { + "auxiliary_loss_clip": 0.01154928, + "auxiliary_loss_mlp": 0.01168445, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.00117087, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.87364008253673, + "language_loss": 0.76508415, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78831792, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.614677667617798 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01168802, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.0014323, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 2.849108605121331, + "language_loss": 0.83355176, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85657519, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.634586811065674 + }, + { + "auxiliary_loss_clip": 0.01183396, + "auxiliary_loss_mlp": 0.011686, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00132549, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.0724400139566104, + "language_loss": 0.84417081, + "learning_rate": 3.91415955422773e-06, + "loss": 0.8676908, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.5235788822174072 + }, + { + "auxiliary_loss_clip": 0.01183384, + "auxiliary_loss_mlp": 0.01168275, + "balance_loss_clip": 1.00224257, + "balance_loss_mlp": 1.0010004, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.8113212869943494, + "language_loss": 0.8418926, + "learning_rate": 3.914046642358844e-06, + "loss": 0.8654092, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.498222589492798 + }, + { + "auxiliary_loss_clip": 0.01151736, + "auxiliary_loss_mlp": 0.00748945, + "balance_loss_clip": 1.00230503, + "balance_loss_mlp": 1.00002503, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.773117431068929, + "language_loss": 0.84407067, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8630774, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.610053539276123 + }, + { + "auxiliary_loss_clip": 0.01135327, + "auxiliary_loss_mlp": 0.01168406, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00113106, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 2.1919110544312845, + "language_loss": 0.95848656, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98152387, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.6308748722076416 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.0116796, + "balance_loss_clip": 1.00193584, + "balance_loss_mlp": 1.00087643, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.099562747990914, + "language_loss": 0.80471742, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82789958, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.6498637199401855 + }, + { + "auxiliary_loss_clip": 0.01121895, + "auxiliary_loss_mlp": 0.01168381, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00101089, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.2202339958703585, + "language_loss": 0.77265257, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79555535, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.6574904918670654 + }, + { + "auxiliary_loss_clip": 0.01167582, + "auxiliary_loss_mlp": 0.01168402, + "balance_loss_clip": 1.0023942, + "balance_loss_mlp": 1.00112712, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 1.9973344020420798, + "language_loss": 0.87216026, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89552009, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.5578866004943848 + }, + { + "auxiliary_loss_clip": 0.01183261, + "auxiliary_loss_mlp": 0.01168278, + "balance_loss_clip": 1.00220227, + "balance_loss_mlp": 1.00109863, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 1.985974156446985, + "language_loss": 0.69425511, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71777058, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.4915661811828613 + }, + { + "auxiliary_loss_clip": 0.01149905, + "auxiliary_loss_mlp": 0.01168167, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00070226, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 4.513735132465645, + "language_loss": 0.80494469, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82812536, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.5646414756774902 + }, + { + "auxiliary_loss_clip": 0.0116776, + "auxiliary_loss_mlp": 0.0116839, + "balance_loss_clip": 1.00242066, + "balance_loss_mlp": 1.00102067, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 4.868994632287374, + "language_loss": 0.69381726, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71717876, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.5179190635681152 + }, + { + "auxiliary_loss_clip": 0.0113439, + "auxiliary_loss_mlp": 0.01168936, + "balance_loss_clip": 1.00241375, + "balance_loss_mlp": 1.00127983, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.6602205807557384, + "language_loss": 0.72844785, + "learning_rate": 3.91302716991575e-06, + "loss": 0.75148118, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.6621954441070557 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.01168828, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00136232, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.9833479764401685, + "language_loss": 0.92268723, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94538009, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.7282555103302 + }, + { + "auxiliary_loss_clip": 0.01155188, + "auxiliary_loss_mlp": 0.01168703, + "balance_loss_clip": 1.00234151, + "balance_loss_mlp": 1.00133324, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 3.8960862643689995, + "language_loss": 0.77890927, + "learning_rate": 3.912799822409549e-06, + "loss": 0.80214822, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 4.032873868942261 + }, + { + "auxiliary_loss_clip": 0.01183259, + "auxiliary_loss_mlp": 0.01167885, + "balance_loss_clip": 1.00224054, + "balance_loss_mlp": 1.00108719, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0491819440692556, + "language_loss": 0.807634, + "learning_rate": 3.912686039853952e-06, + "loss": 0.83114541, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.5266566276550293 + }, + { + "auxiliary_loss_clip": 0.01150938, + "auxiliary_loss_mlp": 0.01168289, + "balance_loss_clip": 1.00231218, + "balance_loss_mlp": 1.00091934, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 4.23683985786764, + "language_loss": 0.85261524, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87580752, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 3.944784641265869 + }, + { + "auxiliary_loss_clip": 0.01134695, + "auxiliary_loss_mlp": 0.01168545, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.00108004, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 5.397802058520756, + "language_loss": 0.85519516, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87822759, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 2.5819079875946045 + }, + { + "auxiliary_loss_clip": 0.01183136, + "auxiliary_loss_mlp": 0.01168226, + "balance_loss_clip": 1.00215316, + "balance_loss_mlp": 1.00123811, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 1.9824778739226074, + "language_loss": 0.72074485, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74425852, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 2.5703043937683105 + }, + { + "auxiliary_loss_clip": 0.01149801, + "auxiliary_loss_mlp": 0.01167904, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00091577, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.2459324748399676, + "language_loss": 0.76308024, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78625727, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 4.018807411193848 + }, + { + "auxiliary_loss_clip": 0.01155026, + "auxiliary_loss_mlp": 0.01168367, + "balance_loss_clip": 1.00222671, + "balance_loss_mlp": 1.00118828, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.0109691297209755, + "language_loss": 0.89120317, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91443706, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 3.9906198978424072 + }, + { + "auxiliary_loss_clip": 0.01150577, + "auxiliary_loss_mlp": 0.01168324, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.0012399, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.5406089151039262, + "language_loss": 0.75522268, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77841169, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.649956226348877 + }, + { + "auxiliary_loss_clip": 0.0112078, + "auxiliary_loss_mlp": 0.01168115, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.001127, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.214372786550298, + "language_loss": 0.77404273, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79693168, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.661025047302246 + }, + { + "auxiliary_loss_clip": 0.01166712, + "auxiliary_loss_mlp": 0.01168059, + "balance_loss_clip": 1.00223064, + "balance_loss_mlp": 1.0010705, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.831118842208432, + "language_loss": 0.79437679, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81772447, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.538266181945801 + }, + { + "auxiliary_loss_clip": 0.01183194, + "auxiliary_loss_mlp": 0.01167782, + "balance_loss_clip": 1.00222778, + "balance_loss_mlp": 1.00098467, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 2.0808702702924315, + "language_loss": 0.74802327, + "learning_rate": 3.911658733556155e-06, + "loss": 0.77153301, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.525080442428589 + }, + { + "auxiliary_loss_clip": 0.01183422, + "auxiliary_loss_mlp": 0.01167841, + "balance_loss_clip": 1.00246716, + "balance_loss_mlp": 1.00094783, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 3.5379536079488036, + "language_loss": 0.75179696, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77530962, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.4964346885681152 + }, + { + "auxiliary_loss_clip": 0.01166384, + "auxiliary_loss_mlp": 0.01167684, + "balance_loss_clip": 1.00201356, + "balance_loss_mlp": 1.00088644, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.5316378174060192, + "language_loss": 0.8917678, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91510844, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.57205867767334 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01168123, + "balance_loss_clip": 1.00247681, + "balance_loss_mlp": 1.00103962, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.5312879576792557, + "language_loss": 0.65447199, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67770118, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 2.56666898727417 + }, + { + "auxiliary_loss_clip": 0.01150183, + "auxiliary_loss_mlp": 0.01168325, + "balance_loss_clip": 1.00204861, + "balance_loss_mlp": 1.00152755, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.5765039645953791, + "language_loss": 0.76340497, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78658998, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.578146457672119 + }, + { + "auxiliary_loss_clip": 0.01183376, + "auxiliary_loss_mlp": 0.01168323, + "balance_loss_clip": 1.00227547, + "balance_loss_mlp": 1.00123966, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8606948892863358, + "language_loss": 0.7188831, + "learning_rate": 3.911085470472892e-06, + "loss": 0.74240011, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.4971227645874023 + }, + { + "auxiliary_loss_clip": 0.01149857, + "auxiliary_loss_mlp": 0.011681, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00130248, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 2.379924161535515, + "language_loss": 0.83484334, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85802293, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 2.567133903503418 + }, + { + "auxiliary_loss_clip": 0.0115069, + "auxiliary_loss_mlp": 0.01168242, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00115848, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 4.496262057421367, + "language_loss": 0.80361414, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82680345, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 2.613081216812134 + }, + { + "auxiliary_loss_clip": 0.01167153, + "auxiliary_loss_mlp": 0.00748812, + "balance_loss_clip": 1.00294423, + "balance_loss_mlp": 1.00003362, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8173667102130318, + "language_loss": 0.58582938, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60498905, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 3.0794243812561035 + }, + { + "auxiliary_loss_clip": 0.01135227, + "auxiliary_loss_mlp": 0.01168038, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00114512, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.5546004777286595, + "language_loss": 0.80887038, + "learning_rate": 3.910625555546292e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.604138135910034 + }, + { + "auxiliary_loss_clip": 0.01149993, + "auxiliary_loss_mlp": 0.0116778, + "balance_loss_clip": 1.00200224, + "balance_loss_mlp": 1.00107753, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8365273560761741, + "language_loss": 0.83453274, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85771048, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.590688943862915 + }, + { + "auxiliary_loss_clip": 0.01151944, + "auxiliary_loss_mlp": 0.01168248, + "balance_loss_clip": 1.00228667, + "balance_loss_mlp": 1.00097334, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7144145344491595, + "language_loss": 0.67670822, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.6999101, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.5778636932373047 + }, + { + "auxiliary_loss_clip": 0.01133568, + "auxiliary_loss_mlp": 0.01168277, + "balance_loss_clip": 1.00195467, + "balance_loss_mlp": 1.00119328, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.6956468587864724, + "language_loss": 0.81839114, + "learning_rate": 3.910279858599409e-06, + "loss": 0.84140962, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.5971643924713135 + }, + { + "auxiliary_loss_clip": 0.01153641, + "auxiliary_loss_mlp": 0.01167505, + "balance_loss_clip": 1.00218141, + "balance_loss_mlp": 1.00099373, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 1.7759740056026139, + "language_loss": 0.80526, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82847148, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.5532405376434326 + }, + { + "auxiliary_loss_clip": 0.01116451, + "auxiliary_loss_mlp": 0.01168134, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00124097, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.960087677660296, + "language_loss": 0.78454041, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80738628, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.725771188735962 + }, + { + "auxiliary_loss_clip": 0.01166357, + "auxiliary_loss_mlp": 0.01168164, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00136638, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.5531854710460657, + "language_loss": 0.68023318, + "learning_rate": 3.90993350971051e-06, + "loss": 0.70357835, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.5589511394500732 + }, + { + "auxiliary_loss_clip": 0.01183232, + "auxiliary_loss_mlp": 0.01168169, + "balance_loss_clip": 1.00235009, + "balance_loss_mlp": 1.00127637, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 3.506596839329931, + "language_loss": 0.72615093, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74966496, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.5320122241973877 + }, + { + "auxiliary_loss_clip": 0.01166598, + "auxiliary_loss_mlp": 0.0116823, + "balance_loss_clip": 1.00219846, + "balance_loss_mlp": 1.00105119, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6702321225571721, + "language_loss": 0.77072495, + "learning_rate": 3.909702248319597e-06, + "loss": 0.79407322, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.572923421859741 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01167964, + "balance_loss_clip": 1.00216222, + "balance_loss_mlp": 1.0010705, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.0114133273586914, + "language_loss": 0.85592961, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87910843, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.5961532592773438 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.01168026, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00103796, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.8758094209216192, + "language_loss": 0.75345427, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77629805, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.661642074584961 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01167908, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.0009197, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.183148369466593, + "language_loss": 0.80954123, + "learning_rate": 3.909354813123452e-06, + "loss": 0.8325569, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.6311419010162354 + }, + { + "auxiliary_loss_clip": 0.01183187, + "auxiliary_loss_mlp": 0.00748916, + "balance_loss_clip": 1.00238943, + "balance_loss_mlp": 1.00007081, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.849530961036989, + "language_loss": 0.80384594, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82316697, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.545771598815918 + }, + { + "auxiliary_loss_clip": 0.01167007, + "auxiliary_loss_mlp": 0.0116819, + "balance_loss_clip": 1.00226581, + "balance_loss_mlp": 1.00101089, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 3.5001527586377157, + "language_loss": 0.74234951, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76570153, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.559803009033203 + }, + { + "auxiliary_loss_clip": 0.01183125, + "auxiliary_loss_mlp": 0.00748948, + "balance_loss_clip": 1.00223064, + "balance_loss_mlp": 1.00020432, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.530223380495382, + "language_loss": 0.74173939, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76106012, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.724501371383667 + }, + { + "auxiliary_loss_clip": 0.01150625, + "auxiliary_loss_mlp": 0.01167637, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00103021, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 1.8389356418307672, + "language_loss": 0.85043883, + "learning_rate": 3.908890552574849e-06, + "loss": 0.8736214, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.629347085952759 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01168493, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00140905, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 2.0995827394169577, + "language_loss": 0.77917063, + "learning_rate": 3.908774306463384e-06, + "loss": 0.80202663, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.704392671585083 + }, + { + "auxiliary_loss_clip": 0.01166275, + "auxiliary_loss_mlp": 0.01167933, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00123048, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 2.444590493939918, + "language_loss": 0.83128434, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85462642, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.555868148803711 + }, + { + "auxiliary_loss_clip": 0.01150761, + "auxiliary_loss_mlp": 0.0116838, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.00120139, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.7311314638297446, + "language_loss": 0.78397375, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80716515, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 4.104424238204956 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.01168382, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.00110745, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.430281698785055, + "language_loss": 0.83406872, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85726094, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.5533289909362793 + }, + { + "auxiliary_loss_clip": 0.01133889, + "auxiliary_loss_mlp": 0.01168415, + "balance_loss_clip": 1.00217915, + "balance_loss_mlp": 1.00142622, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 3.112251772221972, + "language_loss": 0.81444651, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83746958, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.588773250579834 + }, + { + "auxiliary_loss_clip": 0.01153953, + "auxiliary_loss_mlp": 0.01168496, + "balance_loss_clip": 1.00214016, + "balance_loss_mlp": 1.00122201, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 1.9415788903774165, + "language_loss": 0.86376989, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.8869943, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 3.964498519897461 + }, + { + "auxiliary_loss_clip": 0.01166409, + "auxiliary_loss_mlp": 0.01167849, + "balance_loss_clip": 1.00221729, + "balance_loss_mlp": 1.00105143, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 1.8070565871249944, + "language_loss": 0.85271573, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87605834, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.5246617794036865 + }, + { + "auxiliary_loss_clip": 0.01135181, + "auxiliary_loss_mlp": 0.01168217, + "balance_loss_clip": 1.00218785, + "balance_loss_mlp": 1.0010376, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 1.7491504382635845, + "language_loss": 0.78754127, + "learning_rate": 3.907958557264774e-06, + "loss": 0.81057531, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.5965328216552734 + }, + { + "auxiliary_loss_clip": 0.01116998, + "auxiliary_loss_mlp": 0.01168307, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00131905, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.0173002270871736, + "language_loss": 0.79314792, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81600106, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 4.04639196395874 + }, + { + "auxiliary_loss_clip": 0.01149595, + "auxiliary_loss_mlp": 0.01168481, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00158834, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.0852290519254506, + "language_loss": 0.92933047, + "learning_rate": 3.907724834849002e-06, + "loss": 0.95251125, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 4.02419900894165 + }, + { + "auxiliary_loss_clip": 0.01150199, + "auxiliary_loss_mlp": 0.01168034, + "balance_loss_clip": 1.00214171, + "balance_loss_mlp": 1.00104547, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7465911896502855, + "language_loss": 0.80811524, + "learning_rate": 3.907607865127225e-06, + "loss": 0.83129764, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 2.617155075073242 + }, + { + "auxiliary_loss_clip": 0.01115919, + "auxiliary_loss_mlp": 0.01164181, + "balance_loss_clip": 1.00233603, + "balance_loss_mlp": 1.00091171, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8833294183566303, + "language_loss": 0.63285792, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65565884, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.2014219760894775 + }, + { + "auxiliary_loss_clip": 0.01118223, + "auxiliary_loss_mlp": 0.01168645, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00137091, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9636163737001788, + "language_loss": 0.93557215, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95844078, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 2.698270797729492 + }, + { + "auxiliary_loss_clip": 0.01166923, + "auxiliary_loss_mlp": 0.01168392, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00149941, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.7879024672288653, + "language_loss": 0.8135823, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83693546, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.5443685054779053 + }, + { + "auxiliary_loss_clip": 0.01121495, + "auxiliary_loss_mlp": 0.01168384, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00139523, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.560387894284774, + "language_loss": 0.77565408, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79855287, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.701854944229126 + }, + { + "auxiliary_loss_clip": 0.01167542, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_clip": 1.00251365, + "balance_loss_mlp": 1.00125241, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.0386357160226583, + "language_loss": 0.81079799, + "learning_rate": 3.907021931556922e-06, + "loss": 0.8341558, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.6174330711364746 + }, + { + "auxiliary_loss_clip": 0.01166408, + "auxiliary_loss_mlp": 0.01168125, + "balance_loss_clip": 1.002321, + "balance_loss_mlp": 1.00123167, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.7471701106746134, + "language_loss": 0.78350097, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80684626, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.7232913970947266 + }, + { + "auxiliary_loss_clip": 0.01150691, + "auxiliary_loss_mlp": 0.01168435, + "balance_loss_clip": 1.00237155, + "balance_loss_mlp": 1.00154161, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 1.7655906147006168, + "language_loss": 0.75216049, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77535176, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.7017900943756104 + }, + { + "auxiliary_loss_clip": 0.01105324, + "auxiliary_loss_mlp": 0.01167611, + "balance_loss_clip": 1.00207639, + "balance_loss_mlp": 1.00119495, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 2.094299233944427, + "language_loss": 0.90741283, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93014222, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.678058385848999 + }, + { + "auxiliary_loss_clip": 0.01101467, + "auxiliary_loss_mlp": 0.01168398, + "balance_loss_clip": 1.00180626, + "balance_loss_mlp": 1.00121927, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.7282148398823463, + "language_loss": 0.84082091, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86351955, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 2.6992781162261963 + }, + { + "auxiliary_loss_clip": 0.01118881, + "auxiliary_loss_mlp": 0.01168043, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.00105453, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 1.9477929407315355, + "language_loss": 0.73669517, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.75956446, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 2.6536834239959717 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.01167913, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00121117, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.8159778543511151, + "language_loss": 0.75737691, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78023928, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.6556808948516846 + }, + { + "auxiliary_loss_clip": 0.01166053, + "auxiliary_loss_mlp": 0.01168374, + "balance_loss_clip": 1.00223005, + "balance_loss_mlp": 1.00148129, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 1.9811741628381911, + "language_loss": 0.82729316, + "learning_rate": 3.906198587476043e-06, + "loss": 0.85063744, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 2.5194385051727295 + }, + { + "auxiliary_loss_clip": 0.01150627, + "auxiliary_loss_mlp": 0.0116824, + "balance_loss_clip": 1.00217474, + "balance_loss_mlp": 1.00134683, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.7951834325553386, + "language_loss": 0.75022364, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77341229, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.01166957, + "auxiliary_loss_mlp": 0.01168746, + "balance_loss_clip": 1.00239682, + "balance_loss_mlp": 1.00137639, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.3703371752612195, + "language_loss": 0.83440709, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85776412, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.591325044631958 + }, + { + "auxiliary_loss_clip": 0.01166384, + "auxiliary_loss_mlp": 0.01168028, + "balance_loss_clip": 1.00225616, + "balance_loss_mlp": 1.00123, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.2207001451934496, + "language_loss": 0.85144836, + "learning_rate": 3.9058446413892e-06, + "loss": 0.87479246, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.509626865386963 + }, + { + "auxiliary_loss_clip": 0.0116629, + "auxiliary_loss_mlp": 0.01167987, + "balance_loss_clip": 1.00218868, + "balance_loss_mlp": 1.00118971, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.768369160026649, + "language_loss": 0.77075702, + "learning_rate": 3.905726514814646e-06, + "loss": 0.79409981, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.505614995956421 + }, + { + "auxiliary_loss_clip": 0.0115448, + "auxiliary_loss_mlp": 0.01168707, + "balance_loss_clip": 1.00283706, + "balance_loss_mlp": 1.00133705, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.270053280243736, + "language_loss": 0.79287833, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81611025, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.551146984100342 + }, + { + "auxiliary_loss_clip": 0.01149769, + "auxiliary_loss_mlp": 0.01168347, + "balance_loss_clip": 1.00214005, + "balance_loss_mlp": 1.00107241, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.1936577838952678, + "language_loss": 0.89859474, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92177594, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.565836191177368 + }, + { + "auxiliary_loss_clip": 0.01135041, + "auxiliary_loss_mlp": 0.01168425, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00134134, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 3.322224957242783, + "language_loss": 0.80078495, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82381964, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.6535282135009766 + }, + { + "auxiliary_loss_clip": 0.01182793, + "auxiliary_loss_mlp": 0.01167658, + "balance_loss_clip": 1.00225401, + "balance_loss_mlp": 1.00095582, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.8984611600175032, + "language_loss": 0.88164794, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90515244, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 2.527735710144043 + }, + { + "auxiliary_loss_clip": 0.0115062, + "auxiliary_loss_mlp": 0.01168114, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00112534, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.020177940104947, + "language_loss": 0.8748883, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89807558, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.5838050842285156 + }, + { + "auxiliary_loss_clip": 0.01149578, + "auxiliary_loss_mlp": 0.01168348, + "balance_loss_clip": 1.00230002, + "balance_loss_mlp": 1.00126457, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 2.1923406736044906, + "language_loss": 0.73863828, + "learning_rate": 3.905016237952136e-06, + "loss": 0.76181757, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.583409309387207 + }, + { + "auxiliary_loss_clip": 0.01166542, + "auxiliary_loss_mlp": 0.01163622, + "balance_loss_clip": 1.00304413, + "balance_loss_mlp": 1.0003531, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7844163556805352, + "language_loss": 0.61744612, + "learning_rate": 3.904897605614418e-06, + "loss": 0.64074773, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.0993411540985107 + }, + { + "auxiliary_loss_clip": 0.01150026, + "auxiliary_loss_mlp": 0.01168146, + "balance_loss_clip": 1.00206339, + "balance_loss_mlp": 1.00115752, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 3.1966083417730395, + "language_loss": 0.78230017, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80548191, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.6036136150360107 + }, + { + "auxiliary_loss_clip": 0.011491, + "auxiliary_loss_mlp": 0.01163793, + "balance_loss_clip": 1.0028789, + "balance_loss_mlp": 1.00052381, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7511447566415209, + "language_loss": 0.59388471, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61701369, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.0574910640716553 + }, + { + "auxiliary_loss_clip": 0.01166302, + "auxiliary_loss_mlp": 0.01167778, + "balance_loss_clip": 1.00232017, + "balance_loss_mlp": 1.00126624, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.9790118448886915, + "language_loss": 0.63909483, + "learning_rate": 3.904541275215825e-06, + "loss": 0.66243559, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.7070727348327637 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.01168551, + "balance_loss_clip": 1.0023402, + "balance_loss_mlp": 1.00146723, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 1.8878525066054115, + "language_loss": 0.80551803, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82870924, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.5722501277923584 + }, + { + "auxiliary_loss_clip": 0.01167229, + "auxiliary_loss_mlp": 0.01168399, + "balance_loss_clip": 1.00232577, + "balance_loss_mlp": 1.00150633, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7212637916849602, + "language_loss": 0.76196969, + "learning_rate": 3.904303360507276e-06, + "loss": 0.785326, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.56919527053833 + }, + { + "auxiliary_loss_clip": 0.01133905, + "auxiliary_loss_mlp": 0.01167673, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00116134, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.6741396858962103, + "language_loss": 0.76675308, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.78976882, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.862481117248535 + }, + { + "auxiliary_loss_clip": 0.01150082, + "auxiliary_loss_mlp": 0.01167881, + "balance_loss_clip": 1.00208807, + "balance_loss_mlp": 1.00127435, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.230709525419184, + "language_loss": 0.82991314, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85309273, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 4.0232179164886475 + }, + { + "auxiliary_loss_clip": 0.0116678, + "auxiliary_loss_mlp": 0.01167971, + "balance_loss_clip": 1.00233698, + "balance_loss_mlp": 1.00107765, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.95410478287823, + "language_loss": 0.76109684, + "learning_rate": 3.903945946870439e-06, + "loss": 0.78444439, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.635061502456665 + }, + { + "auxiliary_loss_clip": 0.0116613, + "auxiliary_loss_mlp": 0.01167961, + "balance_loss_clip": 1.002267, + "balance_loss_mlp": 1.00135398, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 1.9021209505854526, + "language_loss": 0.87367451, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89701545, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 3.992464065551758 + }, + { + "auxiliary_loss_clip": 0.01117805, + "auxiliary_loss_mlp": 0.01168001, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.0013938, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 2.195828187362524, + "language_loss": 0.69612813, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71898615, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.6670608520507812 + }, + { + "auxiliary_loss_clip": 0.01154696, + "auxiliary_loss_mlp": 0.01167987, + "balance_loss_clip": 1.00232041, + "balance_loss_mlp": 1.00118959, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.4909394957873734, + "language_loss": 0.81916136, + "learning_rate": 3.903587883453228e-06, + "loss": 0.84238821, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.627096176147461 + }, + { + "auxiliary_loss_clip": 0.01150269, + "auxiliary_loss_mlp": 0.01168236, + "balance_loss_clip": 1.00214875, + "balance_loss_mlp": 1.00134349, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 1.9254908295664084, + "language_loss": 0.80298907, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82617414, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.6283810138702393 + }, + { + "auxiliary_loss_clip": 0.0118203, + "auxiliary_loss_mlp": 0.01162519, + "balance_loss_clip": 1.0029583, + "balance_loss_mlp": 1.00001323, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7040473511933106, + "language_loss": 0.57115412, + "learning_rate": 3.903348813579662e-06, + "loss": 0.5945996, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 4.610133171081543 + }, + { + "auxiliary_loss_clip": 0.01133973, + "auxiliary_loss_mlp": 0.01167774, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00126231, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 2.219663021788527, + "language_loss": 0.93856859, + "learning_rate": 3.903229170377845e-06, + "loss": 0.961586, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 3.998288631439209 + }, + { + "auxiliary_loss_clip": 0.01166971, + "auxiliary_loss_mlp": 0.01167507, + "balance_loss_clip": 1.00229955, + "balance_loss_mlp": 1.00080442, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.6784520639305436, + "language_loss": 0.77609277, + "learning_rate": 3.903109455005387e-06, + "loss": 0.79943752, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.581845998764038 + }, + { + "auxiliary_loss_clip": 0.01134078, + "auxiliary_loss_mlp": 0.01168013, + "balance_loss_clip": 1.00230813, + "balance_loss_mlp": 1.00131059, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.7902201308239118, + "language_loss": 0.81437135, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83739227, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.651249408721924 + }, + { + "auxiliary_loss_clip": 0.0117055, + "auxiliary_loss_mlp": 0.01168217, + "balance_loss_clip": 1.00252974, + "balance_loss_mlp": 1.00122833, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 1.8598967051751794, + "language_loss": 0.82920527, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85259295, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.5624806880950928 + }, + { + "auxiliary_loss_clip": 0.01121399, + "auxiliary_loss_mlp": 0.01167847, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00124037, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.7320804491900703, + "language_loss": 0.73854798, + "learning_rate": 3.902749875909578e-06, + "loss": 0.76144046, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.682131290435791 + }, + { + "auxiliary_loss_clip": 0.01182752, + "auxiliary_loss_mlp": 0.0116747, + "balance_loss_clip": 1.00235856, + "balance_loss_mlp": 1.00095832, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.144746117007335, + "language_loss": 0.79226261, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81576484, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.5042564868927 + }, + { + "auxiliary_loss_clip": 0.01182902, + "auxiliary_loss_mlp": 0.01167734, + "balance_loss_clip": 1.00230932, + "balance_loss_mlp": 1.00103188, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.312324238731739, + "language_loss": 0.76137066, + "learning_rate": 3.902509795742467e-06, + "loss": 0.78487706, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.5148355960845947 + }, + { + "auxiliary_loss_clip": 0.01135582, + "auxiliary_loss_mlp": 0.01167765, + "balance_loss_clip": 1.00239646, + "balance_loss_mlp": 1.001158, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.9252114477449256, + "language_loss": 0.83174831, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85478187, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.6118717193603516 + }, + { + "auxiliary_loss_clip": 0.01151093, + "auxiliary_loss_mlp": 0.00749186, + "balance_loss_clip": 1.00225365, + "balance_loss_mlp": 1.00050068, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.6420314739949626, + "language_loss": 0.78566098, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80466384, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.6298298835754395 + }, + { + "auxiliary_loss_clip": 0.01116792, + "auxiliary_loss_mlp": 0.01168061, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00145435, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.345201965490564, + "language_loss": 0.77255392, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79540253, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.730015516281128 + }, + { + "auxiliary_loss_clip": 0.01134377, + "auxiliary_loss_mlp": 0.01167626, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00140035, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.935768813354019, + "language_loss": 0.85938382, + "learning_rate": 3.902028769724367e-06, + "loss": 0.88240385, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 2.6583380699157715 + }, + { + "auxiliary_loss_clip": 0.01134667, + "auxiliary_loss_mlp": 0.01168112, + "balance_loss_clip": 1.00214612, + "balance_loss_mlp": 1.00169623, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 1.9315670696434206, + "language_loss": 0.74452436, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76755214, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 2.5867061614990234 + }, + { + "auxiliary_loss_clip": 0.01166004, + "auxiliary_loss_mlp": 0.01167509, + "balance_loss_clip": 1.00225329, + "balance_loss_mlp": 1.00128353, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 2.0259944965480825, + "language_loss": 0.83728766, + "learning_rate": 3.901787823946341e-06, + "loss": 0.86062276, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 2.5224368572235107 + }, + { + "auxiliary_loss_clip": 0.01150522, + "auxiliary_loss_mlp": 0.01167701, + "balance_loss_clip": 1.00221086, + "balance_loss_mlp": 1.00147581, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 2.3124867295812552, + "language_loss": 0.86712843, + "learning_rate": 3.901667242881065e-06, + "loss": 0.89031065, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.648564100265503 + }, + { + "auxiliary_loss_clip": 0.01150685, + "auxiliary_loss_mlp": 0.00749092, + "balance_loss_clip": 1.0021863, + "balance_loss_mlp": 1.00051999, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.6488338570239727, + "language_loss": 0.70814246, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72714019, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.6855623722076416 + }, + { + "auxiliary_loss_clip": 0.0115109, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00122011, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.254169864602963, + "language_loss": 0.86786294, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89104921, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 2.540550947189331 + }, + { + "auxiliary_loss_clip": 0.01166427, + "auxiliary_loss_mlp": 0.01167354, + "balance_loss_clip": 1.00219309, + "balance_loss_mlp": 1.00112879, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.8036656515387288, + "language_loss": 0.87672222, + "learning_rate": 3.901305067035068e-06, + "loss": 0.90006006, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 2.528763771057129 + }, + { + "auxiliary_loss_clip": 0.01167179, + "auxiliary_loss_mlp": 0.00749131, + "balance_loss_clip": 1.00238502, + "balance_loss_mlp": 1.00048363, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.0941401561552855, + "language_loss": 0.87896931, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89813244, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.4937145709991455 + }, + { + "auxiliary_loss_clip": 0.01182707, + "auxiliary_loss_mlp": 0.01167291, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.00097013, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 2.2099548910765052, + "language_loss": 0.76044393, + "learning_rate": 3.901063255975046e-06, + "loss": 0.78394389, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.516254186630249 + }, + { + "auxiliary_loss_clip": 0.01121977, + "auxiliary_loss_mlp": 0.01167838, + "balance_loss_clip": 1.00240946, + "balance_loss_mlp": 1.00132632, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.9208451233149946, + "language_loss": 0.825688, + "learning_rate": 3.900942242309978e-06, + "loss": 0.84858608, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.659658908843994 + }, + { + "auxiliary_loss_clip": 0.01150037, + "auxiliary_loss_mlp": 0.0116763, + "balance_loss_clip": 1.00221729, + "balance_loss_mlp": 1.00102353, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.95141211036613, + "language_loss": 0.79080462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81398135, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.5641653537750244 + }, + { + "auxiliary_loss_clip": 0.01182676, + "auxiliary_loss_mlp": 0.01167877, + "balance_loss_clip": 1.00232387, + "balance_loss_mlp": 1.00136507, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.6384682931390502, + "language_loss": 0.79551351, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81901908, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.5074636936187744 + }, + { + "auxiliary_loss_clip": 0.01166584, + "auxiliary_loss_mlp": 0.00749194, + "balance_loss_clip": 1.0022769, + "balance_loss_mlp": 1.00053203, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.249049753460363, + "language_loss": 0.7585578, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77771556, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 2.6016883850097656 + }, + { + "auxiliary_loss_clip": 0.01166954, + "auxiliary_loss_mlp": 0.00749226, + "balance_loss_clip": 1.00224757, + "balance_loss_mlp": 1.00067103, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.47959502708606, + "language_loss": 0.78144312, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80060494, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.5969736576080322 + }, + { + "auxiliary_loss_clip": 0.01133512, + "auxiliary_loss_mlp": 0.01167789, + "balance_loss_clip": 1.00216687, + "balance_loss_mlp": 1.00118232, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.78721058480991, + "language_loss": 0.69185835, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71487135, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.7997119426727295 + }, + { + "auxiliary_loss_clip": 0.01116722, + "auxiliary_loss_mlp": 0.00749089, + "balance_loss_clip": 1.00252616, + "balance_loss_mlp": 1.00042713, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8591021651936204, + "language_loss": 0.62840009, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64705825, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.2842581272125244 + }, + { + "auxiliary_loss_clip": 0.01167255, + "auxiliary_loss_mlp": 0.01167624, + "balance_loss_clip": 1.0022763, + "balance_loss_mlp": 1.00101733, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.5895556665827684, + "language_loss": 0.77584481, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79919362, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.5194146633148193 + }, + { + "auxiliary_loss_clip": 0.01117651, + "auxiliary_loss_mlp": 0.01167838, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00113583, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.234403023303668, + "language_loss": 0.79484594, + "learning_rate": 3.899971538354343e-06, + "loss": 0.8177008, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.6587581634521484 + }, + { + "auxiliary_loss_clip": 0.01154524, + "auxiliary_loss_mlp": 0.01167693, + "balance_loss_clip": 1.00248921, + "balance_loss_mlp": 1.00099027, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.8168477556595184, + "language_loss": 0.70684403, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73006618, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.5851292610168457 + }, + { + "auxiliary_loss_clip": 0.01101158, + "auxiliary_loss_mlp": 0.0116793, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00141835, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.456302258318422, + "language_loss": 0.72191155, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74460244, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.79764986038208 + }, + { + "auxiliary_loss_clip": 0.01122713, + "auxiliary_loss_mlp": 0.01167622, + "balance_loss_clip": 1.00234973, + "balance_loss_mlp": 1.00120544, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1462413301204233, + "language_loss": 0.8201282, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84303159, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.6570487022399902 + }, + { + "auxiliary_loss_clip": 0.01166725, + "auxiliary_loss_mlp": 0.01168211, + "balance_loss_clip": 1.00219607, + "balance_loss_mlp": 1.00122261, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 4.781182669022958, + "language_loss": 0.80099225, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82434154, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 3.979722023010254 + }, + { + "auxiliary_loss_clip": 0.01166344, + "auxiliary_loss_mlp": 0.01167385, + "balance_loss_clip": 1.00218117, + "balance_loss_mlp": 1.00096846, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 3.1405884314322625, + "language_loss": 0.82931137, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85264862, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.527104139328003 + }, + { + "auxiliary_loss_clip": 0.0115015, + "auxiliary_loss_mlp": 0.011679, + "balance_loss_clip": 1.00203693, + "balance_loss_mlp": 1.00119781, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.284297350569556, + "language_loss": 0.77500284, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79818332, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 3.922440767288208 + }, + { + "auxiliary_loss_clip": 0.0111608, + "auxiliary_loss_mlp": 0.01162886, + "balance_loss_clip": 1.00224411, + "balance_loss_mlp": 1.00037992, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9002012278688108, + "language_loss": 0.59236622, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61515582, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.356398582458496 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.01167868, + "balance_loss_clip": 1.00211632, + "balance_loss_mlp": 1.00145173, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.286813428237919, + "language_loss": 0.82413781, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84747678, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.5089080333709717 + }, + { + "auxiliary_loss_clip": 0.01166297, + "auxiliary_loss_mlp": 0.01168128, + "balance_loss_clip": 1.00228941, + "balance_loss_mlp": 1.00133014, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.787467076231621, + "language_loss": 0.78923607, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81258029, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 3.936617612838745 + }, + { + "auxiliary_loss_clip": 0.01133411, + "auxiliary_loss_mlp": 0.01167478, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00134814, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1948414635490625, + "language_loss": 0.85538995, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.8783989, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 4.036622047424316 + }, + { + "auxiliary_loss_clip": 0.01150212, + "auxiliary_loss_mlp": 0.01167273, + "balance_loss_clip": 1.00207484, + "balance_loss_mlp": 1.0012387, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 1.8860240704163258, + "language_loss": 0.85986936, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88304412, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.561133623123169 + }, + { + "auxiliary_loss_clip": 0.01150223, + "auxiliary_loss_mlp": 0.01167815, + "balance_loss_clip": 1.00200427, + "balance_loss_mlp": 1.00120807, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 1.765556422624398, + "language_loss": 0.67988753, + "learning_rate": 3.898506837508518e-06, + "loss": 0.70306796, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.625727415084839 + }, + { + "auxiliary_loss_clip": 0.01166354, + "auxiliary_loss_mlp": 0.00749274, + "balance_loss_clip": 1.00224149, + "balance_loss_mlp": 1.00070715, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.327885178530411, + "language_loss": 0.83127993, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85043621, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.5847983360290527 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.00749265, + "balance_loss_clip": 1.0025202, + "balance_loss_mlp": 1.00076604, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.608786619590552, + "language_loss": 0.81736928, + "learning_rate": 3.898261712602539e-06, + "loss": 0.83669192, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.5394821166992188 + }, + { + "auxiliary_loss_clip": 0.01150506, + "auxiliary_loss_mlp": 0.01168186, + "balance_loss_clip": 1.00216508, + "balance_loss_mlp": 1.00119734, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 2.2147359302969374, + "language_loss": 0.78483844, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80802542, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.573092222213745 + }, + { + "auxiliary_loss_clip": 0.01182821, + "auxiliary_loss_mlp": 0.01167583, + "balance_loss_clip": 1.00236964, + "balance_loss_mlp": 1.0011673, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 1.979123142102751, + "language_loss": 0.82418507, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84768915, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.481043815612793 + }, + { + "auxiliary_loss_clip": 0.01149691, + "auxiliary_loss_mlp": 0.01167772, + "balance_loss_clip": 1.00208628, + "balance_loss_mlp": 1.00126076, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.1202568249066105, + "language_loss": 0.7160995, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73927414, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.6875171661376953 + }, + { + "auxiliary_loss_clip": 0.01149558, + "auxiliary_loss_mlp": 0.01167893, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00128615, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.9420065363992287, + "language_loss": 0.71497369, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73814821, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.5919904708862305 + }, + { + "auxiliary_loss_clip": 0.0118265, + "auxiliary_loss_mlp": 0.01167482, + "balance_loss_clip": 1.00226164, + "balance_loss_mlp": 1.00116074, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.7241367322598593, + "language_loss": 0.79133713, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81483841, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.513518810272217 + }, + { + "auxiliary_loss_clip": 0.01166933, + "auxiliary_loss_mlp": 0.01167416, + "balance_loss_clip": 1.00233042, + "balance_loss_mlp": 1.00109565, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 4.964597219075923, + "language_loss": 0.76093411, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78427756, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.6077980995178223 + }, + { + "auxiliary_loss_clip": 0.01166172, + "auxiliary_loss_mlp": 0.01167745, + "balance_loss_clip": 1.00236416, + "balance_loss_mlp": 1.00123382, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 4.306578231646865, + "language_loss": 0.70973599, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.7330752, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.526486396789551 + }, + { + "auxiliary_loss_clip": 0.01182751, + "auxiliary_loss_mlp": 0.01167576, + "balance_loss_clip": 1.00239491, + "balance_loss_mlp": 1.0010649, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 2.072975012781261, + "language_loss": 0.84455907, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86806238, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.4863948822021484 + }, + { + "auxiliary_loss_clip": 0.01166844, + "auxiliary_loss_mlp": 0.01167668, + "balance_loss_clip": 1.00232399, + "balance_loss_mlp": 1.00115669, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.6859800464607815, + "language_loss": 0.78699851, + "learning_rate": 3.897155087940906e-06, + "loss": 0.81034356, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 2.564303398132324 + }, + { + "auxiliary_loss_clip": 0.01133966, + "auxiliary_loss_mlp": 0.00749182, + "balance_loss_clip": 1.00237966, + "balance_loss_mlp": 1.00062585, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.5918631092125293, + "language_loss": 0.80359387, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82242531, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.682530164718628 + }, + { + "auxiliary_loss_clip": 0.0116704, + "auxiliary_loss_mlp": 0.01167931, + "balance_loss_clip": 1.00238097, + "balance_loss_mlp": 1.00122857, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.6050052704056177, + "language_loss": 0.83840638, + "learning_rate": 3.896908379886188e-06, + "loss": 0.86175609, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.544013500213623 + }, + { + "auxiliary_loss_clip": 0.01166452, + "auxiliary_loss_mlp": 0.01167801, + "balance_loss_clip": 1.00221241, + "balance_loss_mlp": 1.0013845, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.9699697067286315, + "language_loss": 0.75770682, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78104937, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.562178134918213 + }, + { + "auxiliary_loss_clip": 0.01105721, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_clip": 1.00247788, + "balance_loss_mlp": 1.00142217, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 2.027033127238524, + "language_loss": 0.8668015, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88953424, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 2.71478533744812 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01167726, + "balance_loss_clip": 1.00221777, + "balance_loss_mlp": 1.00131011, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 1.9671595187308104, + "language_loss": 0.80851483, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83201844, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.579210042953491 + }, + { + "auxiliary_loss_clip": 0.01182668, + "auxiliary_loss_mlp": 0.01167891, + "balance_loss_clip": 1.00227833, + "balance_loss_mlp": 1.00137901, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.3129978309247012, + "language_loss": 0.74784631, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77135193, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 2.4705569744110107 + }, + { + "auxiliary_loss_clip": 0.01134267, + "auxiliary_loss_mlp": 0.01167012, + "balance_loss_clip": 1.00214398, + "balance_loss_mlp": 1.00097728, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.2077405369139904, + "language_loss": 0.8247956, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84780836, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.6826670169830322 + }, + { + "auxiliary_loss_clip": 0.01165981, + "auxiliary_loss_mlp": 0.01167326, + "balance_loss_clip": 1.00223255, + "balance_loss_mlp": 1.00119579, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6194293600135907, + "language_loss": 0.82682705, + "learning_rate": 3.896166529529008e-06, + "loss": 0.85016012, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.537379026412964 + }, + { + "auxiliary_loss_clip": 0.01153611, + "auxiliary_loss_mlp": 0.01167928, + "balance_loss_clip": 1.00237489, + "balance_loss_mlp": 1.00141621, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.729644004220615, + "language_loss": 0.82565409, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84886944, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.6490423679351807 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.01167741, + "balance_loss_clip": 1.0019244, + "balance_loss_mlp": 1.00151563, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.0803977308397354, + "language_loss": 0.72746652, + "learning_rate": 3.895918670803968e-06, + "loss": 0.750471, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.583726644515991 + }, + { + "auxiliary_loss_clip": 0.01182679, + "auxiliary_loss_mlp": 0.00749278, + "balance_loss_clip": 1.00225711, + "balance_loss_mlp": 1.00071633, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.451914993689833, + "language_loss": 0.81681353, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83613312, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.5222904682159424 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.0116777, + "balance_loss_clip": 1.00209522, + "balance_loss_mlp": 1.00116324, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.0665594857453824, + "language_loss": 0.71740544, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74025768, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.7267165184020996 + }, + { + "auxiliary_loss_clip": 0.01104639, + "auxiliary_loss_mlp": 0.0116752, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00091267, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.1105506791855326, + "language_loss": 0.75078094, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.77350247, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.7195992469787598 + }, + { + "auxiliary_loss_clip": 0.01182608, + "auxiliary_loss_mlp": 0.01167254, + "balance_loss_clip": 1.00232697, + "balance_loss_mlp": 1.00112414, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.6202241262300017, + "language_loss": 0.83331335, + "learning_rate": 3.895422090670421e-06, + "loss": 0.856812, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.5717971324920654 + }, + { + "auxiliary_loss_clip": 0.01118067, + "auxiliary_loss_mlp": 0.01167906, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00129867, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 2.1942284728722994, + "language_loss": 0.83405435, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85691404, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.6690115928649902 + }, + { + "auxiliary_loss_clip": 0.01085396, + "auxiliary_loss_mlp": 0.01167822, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00150156, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.3742328802827313, + "language_loss": 0.80350286, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82603502, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.725472927093506 + }, + { + "auxiliary_loss_clip": 0.01182842, + "auxiliary_loss_mlp": 0.01167446, + "balance_loss_clip": 1.00247598, + "balance_loss_mlp": 1.00103009, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 2.101646012338437, + "language_loss": 0.6647855, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68828839, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.563910961151123 + }, + { + "auxiliary_loss_clip": 0.01149356, + "auxiliary_loss_mlp": 0.01167646, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00122976, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.9010877618710953, + "language_loss": 0.67787898, + "learning_rate": 3.8949243605434e-06, + "loss": 0.70104897, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.650489330291748 + }, + { + "auxiliary_loss_clip": 0.01166092, + "auxiliary_loss_mlp": 0.01167555, + "balance_loss_clip": 1.00231194, + "balance_loss_mlp": 1.00132918, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 2.276695654710587, + "language_loss": 0.72186255, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74519897, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 4.019665241241455 + }, + { + "auxiliary_loss_clip": 0.01133652, + "auxiliary_loss_mlp": 0.01167043, + "balance_loss_clip": 1.00224686, + "balance_loss_mlp": 1.00110388, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8896653097824643, + "language_loss": 0.7566765, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77968347, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.5965416431427 + }, + { + "auxiliary_loss_clip": 0.01134342, + "auxiliary_loss_mlp": 0.01167695, + "balance_loss_clip": 1.00250077, + "balance_loss_mlp": 1.00118315, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.561736555444961, + "language_loss": 0.706393, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72941339, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 4.050187587738037 + }, + { + "auxiliary_loss_clip": 0.01147964, + "auxiliary_loss_mlp": 0.01164009, + "balance_loss_clip": 1.00239801, + "balance_loss_mlp": 1.00150335, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8002596827704052, + "language_loss": 0.58978683, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61290652, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.283783435821533 + }, + { + "auxiliary_loss_clip": 0.01166877, + "auxiliary_loss_mlp": 0.01167406, + "balance_loss_clip": 1.00230455, + "balance_loss_mlp": 1.00118089, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 1.9691060251262253, + "language_loss": 0.80046958, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82381243, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.5353262424468994 + }, + { + "auxiliary_loss_clip": 0.01182575, + "auxiliary_loss_mlp": 0.01167215, + "balance_loss_clip": 1.00230598, + "balance_loss_mlp": 1.0010854, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8298107926497424, + "language_loss": 0.74813998, + "learning_rate": 3.894175609775881e-06, + "loss": 0.77163792, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 4.030194997787476 + }, + { + "auxiliary_loss_clip": 0.01132801, + "auxiliary_loss_mlp": 0.01167162, + "balance_loss_clip": 1.00219536, + "balance_loss_mlp": 1.00112677, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 1.7509836534092487, + "language_loss": 0.82464075, + "learning_rate": 3.894050566558015e-06, + "loss": 0.8476404, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 4.000895738601685 + }, + { + "auxiliary_loss_clip": 0.01182844, + "auxiliary_loss_mlp": 0.01167594, + "balance_loss_clip": 1.00248122, + "balance_loss_mlp": 1.00117779, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.28560932929148, + "language_loss": 0.74778569, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77129006, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.5523860454559326 + }, + { + "auxiliary_loss_clip": 0.01133172, + "auxiliary_loss_mlp": 0.01166969, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00112534, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.619508507799722, + "language_loss": 0.84313452, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86613595, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.630180835723877 + }, + { + "auxiliary_loss_clip": 0.0116636, + "auxiliary_loss_mlp": 0.01167545, + "balance_loss_clip": 1.00246155, + "balance_loss_mlp": 1.00141454, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.129580173690581, + "language_loss": 0.9007659, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92410499, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.544689178466797 + }, + { + "auxiliary_loss_clip": 0.01166988, + "auxiliary_loss_mlp": 0.0116725, + "balance_loss_clip": 1.00239253, + "balance_loss_mlp": 1.00111961, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 2.41621689465607, + "language_loss": 0.68526381, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70860624, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.5529582500457764 + }, + { + "auxiliary_loss_clip": 0.01133614, + "auxiliary_loss_mlp": 0.01167218, + "balance_loss_clip": 1.00211036, + "balance_loss_mlp": 1.00137401, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.417023769387833, + "language_loss": 0.78929985, + "learning_rate": 3.893424273224806e-06, + "loss": 0.81230819, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.615478515625 + }, + { + "auxiliary_loss_clip": 0.01182503, + "auxiliary_loss_mlp": 0.01166684, + "balance_loss_clip": 1.00227475, + "balance_loss_mlp": 1.00093555, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.0816389636963026, + "language_loss": 0.86079264, + "learning_rate": 3.893298799142636e-06, + "loss": 0.8842845, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.512993097305298 + }, + { + "auxiliary_loss_clip": 0.01132586, + "auxiliary_loss_mlp": 0.01167313, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00099242, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 4.242850597090671, + "language_loss": 0.82617676, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84917575, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.624378204345703 + }, + { + "auxiliary_loss_clip": 0.01149599, + "auxiliary_loss_mlp": 0.01167228, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00119305, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 1.9025037624385943, + "language_loss": 0.72938979, + "learning_rate": 3.893047635600818e-06, + "loss": 0.75255811, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.550291061401367 + }, + { + "auxiliary_loss_clip": 0.01165625, + "auxiliary_loss_mlp": 0.01166881, + "balance_loss_clip": 1.00231326, + "balance_loss_mlp": 1.00103736, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.3683404055811668, + "language_loss": 0.80325449, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82657957, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.545154333114624 + }, + { + "auxiliary_loss_clip": 0.01119827, + "auxiliary_loss_mlp": 0.01162643, + "balance_loss_clip": 1.00232649, + "balance_loss_mlp": 1.00089979, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8616267045013821, + "language_loss": 0.59008253, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61290729, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.247217893600464 + }, + { + "auxiliary_loss_clip": 0.01086139, + "auxiliary_loss_mlp": 0.01167008, + "balance_loss_clip": 1.00218379, + "balance_loss_mlp": 1.00125909, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 1.6867699804575957, + "language_loss": 0.74119139, + "learning_rate": 3.892670351915842e-06, + "loss": 0.7637229, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.7290585041046143 + }, + { + "auxiliary_loss_clip": 0.01170438, + "auxiliary_loss_mlp": 0.01167208, + "balance_loss_clip": 1.00275993, + "balance_loss_mlp": 1.00117314, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 2.026363088902424, + "language_loss": 0.73085725, + "learning_rate": 3.892544447140657e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 2.5563981533050537 + }, + { + "auxiliary_loss_clip": 0.01165707, + "auxiliary_loss_mlp": 0.01167423, + "balance_loss_clip": 1.00238907, + "balance_loss_mlp": 1.00157905, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 1.896689629108319, + "language_loss": 0.74410355, + "learning_rate": 3.892418470599996e-06, + "loss": 0.7674349, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.542402744293213 + }, + { + "auxiliary_loss_clip": 0.01134859, + "auxiliary_loss_mlp": 0.01166994, + "balance_loss_clip": 1.00238562, + "balance_loss_mlp": 1.00095963, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 1.96474672184523, + "language_loss": 0.79301846, + "learning_rate": 3.892292422298637e-06, + "loss": 0.816037, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.6167147159576416 + }, + { + "auxiliary_loss_clip": 0.01117498, + "auxiliary_loss_mlp": 0.01166848, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.00100374, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9816269582273576, + "language_loss": 0.85495591, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87779939, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 2.6382369995117188 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.011629, + "balance_loss_clip": 1.00426209, + "balance_loss_mlp": 1.00039411, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7586869825002479, + "language_loss": 0.54168785, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56481767, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.130459785461426 + }, + { + "auxiliary_loss_clip": 0.01182471, + "auxiliary_loss_mlp": 0.01167013, + "balance_loss_clip": 1.00238287, + "balance_loss_mlp": 1.00116873, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.9220309042808796, + "language_loss": 0.72016495, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74365979, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.5465681552886963 + }, + { + "auxiliary_loss_clip": 0.01132564, + "auxiliary_loss_mlp": 0.00749237, + "balance_loss_clip": 1.00207543, + "balance_loss_mlp": 1.00074255, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.8231425199347522, + "language_loss": 0.78096271, + "learning_rate": 3.891787511581859e-06, + "loss": 0.79978073, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.6233885288238525 + }, + { + "auxiliary_loss_clip": 0.01166813, + "auxiliary_loss_mlp": 0.01166828, + "balance_loss_clip": 1.00227427, + "balance_loss_mlp": 1.00088882, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 4.7949022935060075, + "language_loss": 0.74943507, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77277148, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 2.5474517345428467 + }, + { + "auxiliary_loss_clip": 0.01182529, + "auxiliary_loss_mlp": 0.01166804, + "balance_loss_clip": 1.00231028, + "balance_loss_mlp": 1.00076962, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.115862125416548, + "language_loss": 0.79929954, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82279289, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 2.4759202003479004 + }, + { + "auxiliary_loss_clip": 0.01182506, + "auxiliary_loss_mlp": 0.01166758, + "balance_loss_clip": 1.00238431, + "balance_loss_mlp": 1.00139093, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 3.0326398763330262, + "language_loss": 0.82652861, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85002124, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.484262228012085 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01166885, + "balance_loss_clip": 1.00232434, + "balance_loss_mlp": 1.00104082, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 1.819303105936899, + "language_loss": 0.69237316, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71522033, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.7589523792266846 + }, + { + "auxiliary_loss_clip": 0.01182338, + "auxiliary_loss_mlp": 0.01166577, + "balance_loss_clip": 1.00239313, + "balance_loss_mlp": 1.00130546, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.8662553756420097, + "language_loss": 0.84695196, + "learning_rate": 3.891154759144557e-06, + "loss": 0.87044114, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.5281412601470947 + }, + { + "auxiliary_loss_clip": 0.01182461, + "auxiliary_loss_mlp": 0.0116661, + "balance_loss_clip": 1.00233328, + "balance_loss_mlp": 1.00086117, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 2.9820284746856647, + "language_loss": 0.87128812, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89477885, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.53048038482666 + }, + { + "auxiliary_loss_clip": 0.01149171, + "auxiliary_loss_mlp": 0.01166867, + "balance_loss_clip": 1.00227165, + "balance_loss_mlp": 1.00121427, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.406277673203889, + "language_loss": 0.72479177, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74795222, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.634648561477661 + }, + { + "auxiliary_loss_clip": 0.01117814, + "auxiliary_loss_mlp": 0.01166794, + "balance_loss_clip": 1.00207055, + "balance_loss_mlp": 1.00123608, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.1265932253715816, + "language_loss": 0.7373935, + "learning_rate": 3.890774247090444e-06, + "loss": 0.7602396, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.7044007778167725 + }, + { + "auxiliary_loss_clip": 0.01170426, + "auxiliary_loss_mlp": 0.01167027, + "balance_loss_clip": 1.00297785, + "balance_loss_mlp": 1.00108719, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 2.1012024850302207, + "language_loss": 0.78698713, + "learning_rate": 3.89064726633596e-06, + "loss": 0.81036162, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.6119956970214844 + }, + { + "auxiliary_loss_clip": 0.01133618, + "auxiliary_loss_mlp": 0.01166651, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00099766, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 5.118618476489406, + "language_loss": 0.7940473, + "learning_rate": 3.890520213887941e-06, + "loss": 0.81704998, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.5994739532470703 + }, + { + "auxiliary_loss_clip": 0.01133367, + "auxiliary_loss_mlp": 0.01166416, + "balance_loss_clip": 1.00216317, + "balance_loss_mlp": 1.00095344, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.104033052571263, + "language_loss": 0.74331224, + "learning_rate": 3.890393089751208e-06, + "loss": 0.7663101, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.57441782951355 + }, + { + "auxiliary_loss_clip": 0.01149881, + "auxiliary_loss_mlp": 0.01166371, + "balance_loss_clip": 1.00212908, + "balance_loss_mlp": 1.00090826, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 3.552786092689632, + "language_loss": 0.84103668, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86419922, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.5993924140930176 + }, + { + "auxiliary_loss_clip": 0.01166082, + "auxiliary_loss_mlp": 0.01166472, + "balance_loss_clip": 1.00241685, + "balance_loss_mlp": 1.00120068, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9556776660736805, + "language_loss": 0.85546088, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87878644, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.5672261714935303 + }, + { + "auxiliary_loss_clip": 0.01154655, + "auxiliary_loss_mlp": 0.00749092, + "balance_loss_clip": 1.00275981, + "balance_loss_mlp": 1.00081706, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.4877247254730506, + "language_loss": 0.82465041, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84368789, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 4.044506311416626 + }, + { + "auxiliary_loss_clip": 0.01132861, + "auxiliary_loss_mlp": 0.00748879, + "balance_loss_clip": 1.00366879, + "balance_loss_mlp": 1.00040746, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7572550652148273, + "language_loss": 0.58029008, + "learning_rate": 3.889883876413563e-06, + "loss": 0.59910738, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.3258323669433594 + }, + { + "auxiliary_loss_clip": 0.01148874, + "auxiliary_loss_mlp": 0.0116219, + "balance_loss_clip": 1.0025425, + "balance_loss_mlp": 1.00044656, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7942717202724816, + "language_loss": 0.55357206, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57668269, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 4.6569294929504395 + }, + { + "auxiliary_loss_clip": 0.01133824, + "auxiliary_loss_mlp": 0.0116621, + "balance_loss_clip": 1.0021565, + "balance_loss_mlp": 1.00093794, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 3.3577433627884523, + "language_loss": 0.73715693, + "learning_rate": 3.889628839737908e-06, + "loss": 0.76015723, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 2.616499662399292 + }, + { + "auxiliary_loss_clip": 0.01135194, + "auxiliary_loss_mlp": 0.01165941, + "balance_loss_clip": 1.0022949, + "balance_loss_mlp": 1.00095546, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.858858827652947, + "language_loss": 0.79538709, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81839836, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.6336381435394287 + }, + { + "auxiliary_loss_clip": 0.01150048, + "auxiliary_loss_mlp": 0.01166466, + "balance_loss_clip": 1.00226152, + "balance_loss_mlp": 1.00109911, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 1.793303917692946, + "language_loss": 0.6935966, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71676171, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 4.068655014038086 + }, + { + "auxiliary_loss_clip": 0.01166683, + "auxiliary_loss_mlp": 0.01166275, + "balance_loss_clip": 1.00235868, + "balance_loss_mlp": 1.00100327, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 1.7664662338958994, + "language_loss": 0.81464553, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83797514, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 4.075867652893066 + }, + { + "auxiliary_loss_clip": 0.0116579, + "auxiliary_loss_mlp": 0.01166529, + "balance_loss_clip": 1.00225639, + "balance_loss_mlp": 1.0012573, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 2.8013688034997517, + "language_loss": 0.8727268, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89605004, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.5231969356536865 + }, + { + "auxiliary_loss_clip": 0.01149544, + "auxiliary_loss_mlp": 0.01166304, + "balance_loss_clip": 1.00235558, + "balance_loss_mlp": 1.00112724, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6810521942255967, + "language_loss": 0.73172355, + "learning_rate": 3.888989994172501e-06, + "loss": 0.7548821, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.6245546340942383 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.01165911, + "balance_loss_clip": 1.0020026, + "balance_loss_mlp": 1.0008297, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.8333500256401327, + "language_loss": 0.87204891, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89487803, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.7015907764434814 + }, + { + "auxiliary_loss_clip": 0.01133592, + "auxiliary_loss_mlp": 0.01166568, + "balance_loss_clip": 1.00227976, + "balance_loss_mlp": 1.00091505, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5498751385690637, + "language_loss": 0.77416849, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79717004, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.6510250568389893 + }, + { + "auxiliary_loss_clip": 0.01149756, + "auxiliary_loss_mlp": 0.01166708, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00124598, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 5.590696435440435, + "language_loss": 0.7837925, + "learning_rate": 3.888605827226212e-06, + "loss": 0.80695713, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.552304983139038 + }, + { + "auxiliary_loss_clip": 0.01163965, + "auxiliary_loss_mlp": 0.01162119, + "balance_loss_clip": 1.002406, + "balance_loss_mlp": 1.00037563, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9668121793766618, + "language_loss": 0.68952501, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71278584, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 2.980919122695923 + }, + { + "auxiliary_loss_clip": 0.01132296, + "auxiliary_loss_mlp": 0.01166473, + "balance_loss_clip": 1.00202644, + "balance_loss_mlp": 1.00120103, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 2.100758496120043, + "language_loss": 0.67644459, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69943225, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.713418960571289 + }, + { + "auxiliary_loss_clip": 0.01165345, + "auxiliary_loss_mlp": 0.01167143, + "balance_loss_clip": 1.00226831, + "balance_loss_mlp": 1.0014894, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 6.57344853319538, + "language_loss": 0.82619411, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84951895, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.5311625003814697 + }, + { + "auxiliary_loss_clip": 0.01182354, + "auxiliary_loss_mlp": 0.01166488, + "balance_loss_clip": 1.0023104, + "balance_loss_mlp": 1.00112057, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0040598705502912, + "language_loss": 0.66193539, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68542379, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.5074925422668457 + }, + { + "auxiliary_loss_clip": 0.01166743, + "auxiliary_loss_mlp": 0.01166184, + "balance_loss_clip": 1.00237083, + "balance_loss_mlp": 1.00100756, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2451932083244763, + "language_loss": 0.89289284, + "learning_rate": 3.887964116724835e-06, + "loss": 0.9162221, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.5503087043762207 + }, + { + "auxiliary_loss_clip": 0.01149817, + "auxiliary_loss_mlp": 0.01166523, + "balance_loss_clip": 1.00223815, + "balance_loss_mlp": 1.00115573, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.58519594332068, + "language_loss": 0.73679948, + "learning_rate": 3.887835559829712e-06, + "loss": 0.75996286, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.6425065994262695 + }, + { + "auxiliary_loss_clip": 0.01165692, + "auxiliary_loss_mlp": 0.01166222, + "balance_loss_clip": 1.00226128, + "balance_loss_mlp": 1.00095057, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 5.807335235421301, + "language_loss": 0.85525239, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87857151, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.52130126953125 + }, + { + "auxiliary_loss_clip": 0.01149512, + "auxiliary_loss_mlp": 0.01166008, + "balance_loss_clip": 1.00235128, + "balance_loss_mlp": 1.00092685, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 1.890069999228036, + "language_loss": 0.81212091, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83527613, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.569282054901123 + }, + { + "auxiliary_loss_clip": 0.01116721, + "auxiliary_loss_mlp": 0.01166809, + "balance_loss_clip": 1.00207484, + "balance_loss_mlp": 1.00144196, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.3880251666849235, + "language_loss": 0.74252057, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76535583, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.6996490955352783 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.01166619, + "balance_loss_clip": 1.00202405, + "balance_loss_mlp": 1.00115693, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 1.8066347964136718, + "language_loss": 0.80262202, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82561314, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.608628988265991 + }, + { + "auxiliary_loss_clip": 0.01117825, + "auxiliary_loss_mlp": 0.01166945, + "balance_loss_clip": 1.00213194, + "balance_loss_mlp": 1.00167298, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.6172017681402544, + "language_loss": 0.72162688, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74447465, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 2.712779998779297 + }, + { + "auxiliary_loss_clip": 0.01133574, + "auxiliary_loss_mlp": 0.01166778, + "balance_loss_clip": 1.00233483, + "balance_loss_mlp": 1.00112462, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 5.940080086481546, + "language_loss": 0.65775907, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.68076265, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 2.6334269046783447 + }, + { + "auxiliary_loss_clip": 0.01182191, + "auxiliary_loss_mlp": 0.01166024, + "balance_loss_clip": 1.00223863, + "balance_loss_mlp": 1.00094295, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.4300111621084803, + "language_loss": 0.82346153, + "learning_rate": 3.886933657403615e-06, + "loss": 0.84694374, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.480619192123413 + }, + { + "auxiliary_loss_clip": 0.01149687, + "auxiliary_loss_mlp": 0.01166369, + "balance_loss_clip": 1.00210464, + "balance_loss_mlp": 1.00119293, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 2.270256467436793, + "language_loss": 0.82306302, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84622347, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.616183042526245 + }, + { + "auxiliary_loss_clip": 0.01165839, + "auxiliary_loss_mlp": 0.01166646, + "balance_loss_clip": 1.00214052, + "balance_loss_mlp": 1.00108838, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6798842364445477, + "language_loss": 0.87070966, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.8940345, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.577218770980835 + }, + { + "auxiliary_loss_clip": 0.01182208, + "auxiliary_loss_mlp": 0.01166539, + "balance_loss_clip": 1.0022583, + "balance_loss_mlp": 1.00126672, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5570845042324353, + "language_loss": 0.7715137, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79500115, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 2.499166965484619 + }, + { + "auxiliary_loss_clip": 0.01150275, + "auxiliary_loss_mlp": 0.01166332, + "balance_loss_clip": 1.00218332, + "balance_loss_mlp": 1.00096512, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.9469433774917233, + "language_loss": 0.7857998, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80896592, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 2.5645575523376465 + }, + { + "auxiliary_loss_clip": 0.01149201, + "auxiliary_loss_mlp": 0.01166017, + "balance_loss_clip": 1.00220132, + "balance_loss_mlp": 1.00103116, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 3.191331948631098, + "language_loss": 0.68774867, + "learning_rate": 3.886287294705924e-06, + "loss": 0.7109009, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.639011859893799 + }, + { + "auxiliary_loss_clip": 0.01153495, + "auxiliary_loss_mlp": 0.01166239, + "balance_loss_clip": 1.0029031, + "balance_loss_mlp": 1.00106299, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.437896112911371, + "language_loss": 0.81663799, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83983529, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.5365774631500244 + }, + { + "auxiliary_loss_clip": 0.01117576, + "auxiliary_loss_mlp": 0.01166325, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00105345, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 2.6648209723059835, + "language_loss": 0.77793276, + "learning_rate": 3.886028248895093e-06, + "loss": 0.80077177, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.6737334728240967 + }, + { + "auxiliary_loss_clip": 0.01182368, + "auxiliary_loss_mlp": 0.01166122, + "balance_loss_clip": 1.0025084, + "balance_loss_mlp": 1.00085032, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.6803763169718864, + "language_loss": 0.83505857, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85854352, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.513460636138916 + }, + { + "auxiliary_loss_clip": 0.01182282, + "auxiliary_loss_mlp": 0.01166569, + "balance_loss_clip": 1.00236261, + "balance_loss_mlp": 1.00120234, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 2.23488402214909, + "language_loss": 0.64715701, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67064548, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.5885064601898193 + }, + { + "auxiliary_loss_clip": 0.011496, + "auxiliary_loss_mlp": 0.01165675, + "balance_loss_clip": 1.00207543, + "balance_loss_mlp": 1.00107098, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.4405737157106469, + "language_loss": 0.7299701, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.75312281, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.613525152206421 + }, + { + "auxiliary_loss_clip": 0.01165167, + "auxiliary_loss_mlp": 0.01166377, + "balance_loss_clip": 1.00220585, + "balance_loss_mlp": 1.0012958, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.9008212515760203, + "language_loss": 0.86337125, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88668668, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.5290145874023438 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01165897, + "balance_loss_clip": 1.00227189, + "balance_loss_mlp": 1.00110197, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.625718727115277, + "language_loss": 0.79282701, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81614536, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.518996000289917 + }, + { + "auxiliary_loss_clip": 0.01100926, + "auxiliary_loss_mlp": 0.01160992, + "balance_loss_clip": 1.00228345, + "balance_loss_mlp": 1.00001168, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7502971639673851, + "language_loss": 0.60520226, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62782139, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 4.8538713455200195 + }, + { + "auxiliary_loss_clip": 0.01165906, + "auxiliary_loss_mlp": 0.01166603, + "balance_loss_clip": 1.00249898, + "balance_loss_mlp": 1.00104499, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 1.888152810396915, + "language_loss": 0.81058776, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83391285, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 3.346479892730713 + }, + { + "auxiliary_loss_clip": 0.01149889, + "auxiliary_loss_mlp": 0.01165773, + "balance_loss_clip": 1.00212848, + "balance_loss_mlp": 1.00088263, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.154107025672589, + "language_loss": 0.77538991, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79854655, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 4.0618226528167725 + }, + { + "auxiliary_loss_clip": 0.01132225, + "auxiliary_loss_mlp": 0.01165735, + "balance_loss_clip": 1.00210369, + "balance_loss_mlp": 1.0011307, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.3986410647632195, + "language_loss": 0.84445572, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86743534, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.705789566040039 + }, + { + "auxiliary_loss_clip": 0.01165629, + "auxiliary_loss_mlp": 0.01166257, + "balance_loss_clip": 1.00220978, + "balance_loss_mlp": 1.00108087, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.977434524439992, + "language_loss": 0.82207835, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84539723, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.5375068187713623 + }, + { + "auxiliary_loss_clip": 0.01182106, + "auxiliary_loss_mlp": 0.01166234, + "balance_loss_clip": 1.00218439, + "balance_loss_mlp": 1.0012486, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.852441634290852, + "language_loss": 0.86080265, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88428605, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.491178274154663 + }, + { + "auxiliary_loss_clip": 0.01164383, + "auxiliary_loss_mlp": 0.01160325, + "balance_loss_clip": 1.00263333, + "balance_loss_mlp": 1.00010729, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7857005032992904, + "language_loss": 0.61777806, + "learning_rate": 3.884467967864485e-06, + "loss": 0.64102507, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 4.726408004760742 + }, + { + "auxiliary_loss_clip": 0.01165327, + "auxiliary_loss_mlp": 0.01166442, + "balance_loss_clip": 1.00234127, + "balance_loss_mlp": 1.00136113, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.743352677575639, + "language_loss": 0.89639926, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91971695, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.5591654777526855 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.01166506, + "balance_loss_clip": 1.00218904, + "balance_loss_mlp": 1.00113893, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.257730109797822, + "language_loss": 0.84839404, + "learning_rate": 3.884206920366591e-06, + "loss": 0.87156475, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.569622755050659 + }, + { + "auxiliary_loss_clip": 0.0118213, + "auxiliary_loss_mlp": 0.01166328, + "balance_loss_clip": 1.00229955, + "balance_loss_mlp": 1.00105667, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.4784615148468436, + "language_loss": 0.75106287, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77454734, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.5181126594543457 + }, + { + "auxiliary_loss_clip": 0.01132406, + "auxiliary_loss_mlp": 0.01166147, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00106621, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 2.4436744083690187, + "language_loss": 0.83279955, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85578507, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.5822274684906006 + }, + { + "auxiliary_loss_clip": 0.01148953, + "auxiliary_loss_mlp": 0.00749347, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00114679, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 3.546185368697521, + "language_loss": 0.81994915, + "learning_rate": 3.883814813262277e-06, + "loss": 0.83893216, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.5336737632751465 + }, + { + "auxiliary_loss_clip": 0.01165831, + "auxiliary_loss_mlp": 0.01166388, + "balance_loss_clip": 1.00224304, + "balance_loss_mlp": 1.0012114, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.484082864612142, + "language_loss": 0.82650709, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84982926, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.4999821186065674 + }, + { + "auxiliary_loss_clip": 0.01133128, + "auxiliary_loss_mlp": 0.01166064, + "balance_loss_clip": 1.0020417, + "balance_loss_mlp": 1.00145984, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 2.9977355801950187, + "language_loss": 0.73756111, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.760553, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.652240514755249 + }, + { + "auxiliary_loss_clip": 0.0114975, + "auxiliary_loss_mlp": 0.01166229, + "balance_loss_clip": 1.00225496, + "balance_loss_mlp": 1.00124335, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 3.169496921977095, + "language_loss": 0.74776751, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77092731, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.6013872623443604 + }, + { + "auxiliary_loss_clip": 0.01182068, + "auxiliary_loss_mlp": 0.01165923, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00122309, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 2.666791875513534, + "language_loss": 0.63357008, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65705001, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.621608257293701 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01166124, + "balance_loss_clip": 1.00216031, + "balance_loss_mlp": 1.00104332, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 3.8213743377424714, + "language_loss": 0.82808965, + "learning_rate": 3.883159872799043e-06, + "loss": 0.85124803, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.5777604579925537 + }, + { + "auxiliary_loss_clip": 0.01098943, + "auxiliary_loss_mlp": 0.01166544, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00136805, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 2.1366397538017856, + "language_loss": 0.88035953, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.9030143, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.8030545711517334 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01166238, + "balance_loss_clip": 1.00211203, + "balance_loss_mlp": 1.00106144, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 6.553447663086879, + "language_loss": 0.71699566, + "learning_rate": 3.882897396711683e-06, + "loss": 0.74030936, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.673065185546875 + }, + { + "auxiliary_loss_clip": 0.01116116, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00124574, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.5686228271143903, + "language_loss": 0.66885269, + "learning_rate": 3.882766051566027e-06, + "loss": 0.69167423, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.7244417667388916 + }, + { + "auxiliary_loss_clip": 0.01116067, + "auxiliary_loss_mlp": 0.0116622, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00113881, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 2.091911314793556, + "language_loss": 0.76761949, + "learning_rate": 3.882634635025694e-06, + "loss": 0.79044241, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.6998417377471924 + }, + { + "auxiliary_loss_clip": 0.01149625, + "auxiliary_loss_mlp": 0.01166282, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00110579, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 1.9074265474922167, + "language_loss": 0.82090914, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84406817, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.56699275970459 + }, + { + "auxiliary_loss_clip": 0.01165587, + "auxiliary_loss_mlp": 0.01166151, + "balance_loss_clip": 1.00231838, + "balance_loss_mlp": 1.00106978, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.245721860866737, + "language_loss": 0.76542497, + "learning_rate": 3.882371587780931e-06, + "loss": 0.7887423, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.6155753135681152 + }, + { + "auxiliary_loss_clip": 0.01133711, + "auxiliary_loss_mlp": 0.01166303, + "balance_loss_clip": 1.00224662, + "balance_loss_mlp": 1.00103152, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 1.8272718627427058, + "language_loss": 0.81023788, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83323801, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.6127138137817383 + }, + { + "auxiliary_loss_clip": 0.01149335, + "auxiliary_loss_mlp": 0.01166257, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00136662, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 3.1888489853418367, + "language_loss": 0.75727427, + "learning_rate": 3.882108255017295e-06, + "loss": 0.78043014, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 2.5225718021392822 + }, + { + "auxiliary_loss_clip": 0.01165923, + "auxiliary_loss_mlp": 0.01166339, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 1.00144839, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 1.8800974025167276, + "language_loss": 0.80914295, + "learning_rate": 3.881976481578379e-06, + "loss": 0.83246559, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 2.514547109603882 + }, + { + "auxiliary_loss_clip": 0.01164131, + "auxiliary_loss_mlp": 0.01160316, + "balance_loss_clip": 1.00233459, + "balance_loss_mlp": 1.00009918, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.691348134853453, + "language_loss": 0.60672033, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62996483, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 3.2127108573913574 + }, + { + "auxiliary_loss_clip": 0.01181972, + "auxiliary_loss_mlp": 0.00749232, + "balance_loss_clip": 1.00224566, + "balance_loss_mlp": 1.00093126, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.6177806660433196, + "language_loss": 0.77758515, + "learning_rate": 3.881712720611336e-06, + "loss": 0.79689717, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.580301284790039 + }, + { + "auxiliary_loss_clip": 0.01166278, + "auxiliary_loss_mlp": 0.01165753, + "balance_loss_clip": 1.00206447, + "balance_loss_mlp": 1.00105393, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.4907537249201255, + "language_loss": 0.78752232, + "learning_rate": 3.881580733093211e-06, + "loss": 0.81084263, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.5780344009399414 + }, + { + "auxiliary_loss_clip": 0.01165253, + "auxiliary_loss_mlp": 0.01165724, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00092912, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.2544699995667754, + "language_loss": 0.81124419, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83455396, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 2.5051848888397217 + }, + { + "auxiliary_loss_clip": 0.01166345, + "auxiliary_loss_mlp": 0.01166674, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00130689, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.6285592007100935, + "language_loss": 0.69586074, + "learning_rate": 3.881316544012779e-06, + "loss": 0.7191909, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 2.587001085281372 + }, + { + "auxiliary_loss_clip": 0.01166469, + "auxiliary_loss_mlp": 0.00749271, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00101626, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 6.325063887445644, + "language_loss": 0.8056891, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82484651, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.55843186378479 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01166299, + "balance_loss_clip": 1.00247133, + "balance_loss_mlp": 1.00131369, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.058157789878862, + "language_loss": 0.74774528, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77106756, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 2.672400951385498 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.01166487, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00140631, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 5.376364171527477, + "language_loss": 0.76588988, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78855026, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.7695634365081787 + }, + { + "auxiliary_loss_clip": 0.01116045, + "auxiliary_loss_mlp": 0.01165679, + "balance_loss_clip": 1.00170183, + "balance_loss_mlp": 1.0008837, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.761519999802632, + "language_loss": 0.79857123, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82138848, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.727548599243164 + }, + { + "auxiliary_loss_clip": 0.01182223, + "auxiliary_loss_mlp": 0.01166561, + "balance_loss_clip": 1.00235677, + "balance_loss_mlp": 1.00128937, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.7390090552059791, + "language_loss": 0.83628792, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85977572, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.477865695953369 + }, + { + "auxiliary_loss_clip": 0.01149185, + "auxiliary_loss_mlp": 0.01165913, + "balance_loss_clip": 1.00211513, + "balance_loss_mlp": 1.00092757, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.670979222818592, + "language_loss": 0.7350837, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75823468, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.5877997875213623 + }, + { + "auxiliary_loss_clip": 0.0116627, + "auxiliary_loss_mlp": 0.01166258, + "balance_loss_clip": 1.00241303, + "balance_loss_mlp": 1.00136757, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 1.9663222411736065, + "language_loss": 0.84914702, + "learning_rate": 3.880389635293729e-06, + "loss": 0.87247229, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 4.034441232681274 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.01166312, + "balance_loss_clip": 1.002051, + "balance_loss_mlp": 1.00132608, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 2.567524172676072, + "language_loss": 0.75229526, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77544558, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.63045072555542 + }, + { + "auxiliary_loss_clip": 0.01148849, + "auxiliary_loss_mlp": 0.01166074, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00127959, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 2.0610252876522575, + "language_loss": 0.7464782, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76962745, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.617936849594116 + }, + { + "auxiliary_loss_clip": 0.0111643, + "auxiliary_loss_mlp": 0.01166075, + "balance_loss_clip": 1.00201344, + "balance_loss_mlp": 1.00089872, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.088246697811136, + "language_loss": 0.86245507, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88528013, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 4.098492860794067 + }, + { + "auxiliary_loss_clip": 0.0113281, + "auxiliary_loss_mlp": 0.01166035, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00114512, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.099899851167332, + "language_loss": 0.68536592, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70835435, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.746243953704834 + }, + { + "auxiliary_loss_clip": 0.01116696, + "auxiliary_loss_mlp": 0.01165979, + "balance_loss_clip": 1.00229049, + "balance_loss_mlp": 1.00127912, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 2.21493183864034, + "language_loss": 0.87198842, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89481521, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.6799094676971436 + }, + { + "auxiliary_loss_clip": 0.01149101, + "auxiliary_loss_mlp": 0.00749308, + "balance_loss_clip": 1.00220978, + "balance_loss_mlp": 1.00117862, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 2.141700255203196, + "language_loss": 0.746378, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76536208, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 5.527067422866821 + }, + { + "auxiliary_loss_clip": 0.0113036, + "auxiliary_loss_mlp": 0.01160382, + "balance_loss_clip": 1.00217307, + "balance_loss_mlp": 1.00016451, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.6985856448894467, + "language_loss": 0.51645517, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53936267, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.2483644485473633 + }, + { + "auxiliary_loss_clip": 0.01166289, + "auxiliary_loss_mlp": 0.01166208, + "balance_loss_clip": 1.00225425, + "balance_loss_mlp": 1.00122285, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.0832467633427845, + "language_loss": 0.71472907, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73805404, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 2.5800528526306152 + }, + { + "auxiliary_loss_clip": 0.01165942, + "auxiliary_loss_mlp": 0.01166327, + "balance_loss_clip": 1.0023694, + "balance_loss_mlp": 1.0013411, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.3092193390877633, + "language_loss": 0.80012619, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82344884, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.536726713180542 + }, + { + "auxiliary_loss_clip": 0.01165465, + "auxiliary_loss_mlp": 0.0116577, + "balance_loss_clip": 1.0022589, + "balance_loss_mlp": 1.00087965, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.3976612985574994, + "language_loss": 0.78796065, + "learning_rate": 3.879059419522011e-06, + "loss": 0.81127304, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.602318048477173 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.01165535, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00140738, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.0032381697850488, + "language_loss": 0.80133033, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82431334, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.6228249073028564 + }, + { + "auxiliary_loss_clip": 0.01165252, + "auxiliary_loss_mlp": 0.01165782, + "balance_loss_clip": 1.0021615, + "balance_loss_mlp": 1.00108194, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.832209527246935, + "language_loss": 0.78250754, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80581784, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.546053886413574 + }, + { + "auxiliary_loss_clip": 0.01166424, + "auxiliary_loss_mlp": 0.01165858, + "balance_loss_clip": 1.00237083, + "balance_loss_mlp": 1.00125432, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.9727066047996478, + "language_loss": 0.78659868, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.8099215, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.560731887817383 + }, + { + "auxiliary_loss_clip": 0.01117093, + "auxiliary_loss_mlp": 0.01165347, + "balance_loss_clip": 1.00201929, + "balance_loss_mlp": 1.00093317, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.374639932737228, + "language_loss": 0.68788314, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71070755, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.683807611465454 + }, + { + "auxiliary_loss_clip": 0.01149977, + "auxiliary_loss_mlp": 0.01165738, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00094283, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 2.0251828798155764, + "language_loss": 0.86756527, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89072239, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.5681419372558594 + }, + { + "auxiliary_loss_clip": 0.01182019, + "auxiliary_loss_mlp": 0.0116575, + "balance_loss_clip": 1.0023402, + "balance_loss_mlp": 1.00124121, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 1.7358911024192598, + "language_loss": 0.75521839, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77869606, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.543372631072998 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01165649, + "balance_loss_clip": 1.00217187, + "balance_loss_mlp": 1.00123572, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.472678757068815, + "language_loss": 0.82642657, + "learning_rate": 3.878124028561692e-06, + "loss": 0.84940851, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.6763463020324707 + }, + { + "auxiliary_loss_clip": 0.01148996, + "auxiliary_loss_mlp": 0.00749273, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00100994, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.281123969528552, + "language_loss": 0.86321414, + "learning_rate": 3.877990116366466e-06, + "loss": 0.88219678, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.6401288509368896 + }, + { + "auxiliary_loss_clip": 0.01163448, + "auxiliary_loss_mlp": 0.01159769, + "balance_loss_clip": 1.00228071, + "balance_loss_mlp": 1.00031435, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.753881168315147, + "language_loss": 0.6567136, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67994571, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.2630488872528076 + }, + { + "auxiliary_loss_clip": 0.01165207, + "auxiliary_loss_mlp": 0.0116487, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.00074303, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.1614397529731093, + "language_loss": 0.78637016, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80967093, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.5170445442199707 + }, + { + "auxiliary_loss_clip": 0.01165339, + "auxiliary_loss_mlp": 0.01165725, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00102592, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.7675461566267507, + "language_loss": 0.77920473, + "learning_rate": 3.877587952519672e-06, + "loss": 0.80251539, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.544408082962036 + }, + { + "auxiliary_loss_clip": 0.01085301, + "auxiliary_loss_mlp": 0.01165143, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00101602, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 1.7715793781507616, + "language_loss": 0.876809, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89931345, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.722707748413086 + }, + { + "auxiliary_loss_clip": 0.01180568, + "auxiliary_loss_mlp": 0.01160567, + "balance_loss_clip": 1.00250626, + "balance_loss_mlp": 1.00034952, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8851250526978415, + "language_loss": 0.5906772, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61408854, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.13623309135437 + }, + { + "auxiliary_loss_clip": 0.01181952, + "auxiliary_loss_mlp": 0.00749286, + "balance_loss_clip": 1.00225091, + "balance_loss_mlp": 1.00101435, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 2.015545121571375, + "language_loss": 0.79751515, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81682754, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.514556884765625 + }, + { + "auxiliary_loss_clip": 0.01132808, + "auxiliary_loss_mlp": 0.01165424, + "balance_loss_clip": 1.00219321, + "balance_loss_mlp": 1.00101066, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.04295206407953, + "language_loss": 0.78417504, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80715728, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.606553077697754 + }, + { + "auxiliary_loss_clip": 0.01133961, + "auxiliary_loss_mlp": 0.0116589, + "balance_loss_clip": 1.00212145, + "balance_loss_mlp": 1.00100005, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 2.2333825019182494, + "language_loss": 0.6834653, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70646381, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 2.596886157989502 + }, + { + "auxiliary_loss_clip": 0.01181947, + "auxiliary_loss_mlp": 0.01165378, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.00106001, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 2.111386834460731, + "language_loss": 0.83753973, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.861013, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.454611301422119 + }, + { + "auxiliary_loss_clip": 0.01182042, + "auxiliary_loss_mlp": 0.01165584, + "balance_loss_clip": 1.00235021, + "balance_loss_mlp": 1.0009793, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.037580062519963, + "language_loss": 0.81824982, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84172606, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.539217233657837 + }, + { + "auxiliary_loss_clip": 0.01131949, + "auxiliary_loss_mlp": 0.00749271, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00111842, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 1.7792717780290284, + "language_loss": 0.87074196, + "learning_rate": 3.876512383242215e-06, + "loss": 0.8895542, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.688622236251831 + }, + { + "auxiliary_loss_clip": 0.01181951, + "auxiliary_loss_mlp": 0.01165316, + "balance_loss_clip": 1.00243831, + "balance_loss_mlp": 1.00109315, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 1.8404800987430388, + "language_loss": 0.80323672, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82670939, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 2.5511507987976074 + }, + { + "auxiliary_loss_clip": 0.01132434, + "auxiliary_loss_mlp": 0.01165562, + "balance_loss_clip": 1.00209808, + "balance_loss_mlp": 1.00105286, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 3.2832149699194404, + "language_loss": 0.85818297, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88116288, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.594261884689331 + }, + { + "auxiliary_loss_clip": 0.01166182, + "auxiliary_loss_mlp": 0.01166017, + "balance_loss_clip": 1.00229156, + "balance_loss_mlp": 1.00122237, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.0477292909061684, + "language_loss": 0.77858067, + "learning_rate": 3.876107870523477e-06, + "loss": 0.80190265, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.539332866668701 + }, + { + "auxiliary_loss_clip": 0.01181899, + "auxiliary_loss_mlp": 0.00749281, + "balance_loss_clip": 1.0023191, + "balance_loss_mlp": 1.00117218, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6386733918012766, + "language_loss": 0.77338809, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79269993, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 2.487213373184204 + }, + { + "auxiliary_loss_clip": 0.01149135, + "auxiliary_loss_mlp": 0.01165515, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.0010066, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 1.9405568713999708, + "language_loss": 0.80570364, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82885015, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 2.604912519454956 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01160003, + "balance_loss_clip": 1.00237775, + "balance_loss_mlp": 1.00054836, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8648015687763172, + "language_loss": 0.59059715, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61371356, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.185183048248291 + }, + { + "auxiliary_loss_clip": 0.01134095, + "auxiliary_loss_mlp": 0.01165565, + "balance_loss_clip": 1.002249, + "balance_loss_mlp": 1.00096047, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.255238988572192, + "language_loss": 0.65364665, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.7341341972351074 + }, + { + "auxiliary_loss_clip": 0.01117446, + "auxiliary_loss_mlp": 0.01165127, + "balance_loss_clip": 1.00223732, + "balance_loss_mlp": 1.0009048, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.5698314514743823, + "language_loss": 0.70817316, + "learning_rate": 3.875432259883256e-06, + "loss": 0.73099887, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.6999285221099854 + }, + { + "auxiliary_loss_clip": 0.0113351, + "auxiliary_loss_mlp": 0.01165971, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.001176, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.7284771502498197, + "language_loss": 0.85956562, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88256043, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 4.161855220794678 + }, + { + "auxiliary_loss_clip": 0.01133922, + "auxiliary_loss_mlp": 0.01164583, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00102758, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.8364929959069203, + "language_loss": 0.67203367, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69501877, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.7496726512908936 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01165708, + "balance_loss_clip": 1.0021044, + "balance_loss_mlp": 1.00100863, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.152894670856293, + "language_loss": 0.89216888, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91515875, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 4.011184930801392 + }, + { + "auxiliary_loss_clip": 0.01165052, + "auxiliary_loss_mlp": 0.01165377, + "balance_loss_clip": 1.00208664, + "balance_loss_mlp": 1.00105894, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 2.854523729604531, + "language_loss": 0.71210325, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.73540747, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.527406692504883 + }, + { + "auxiliary_loss_clip": 0.0114988, + "auxiliary_loss_mlp": 0.00749162, + "balance_loss_clip": 1.00242829, + "balance_loss_mlp": 1.00104392, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 2.6404413985887527, + "language_loss": 0.81993413, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83892459, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.5992228984832764 + }, + { + "auxiliary_loss_clip": 0.01164993, + "auxiliary_loss_mlp": 0.01164965, + "balance_loss_clip": 1.00213122, + "balance_loss_mlp": 1.00102854, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.9566237704850133, + "language_loss": 0.89231443, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91561401, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 3.9986495971679688 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.01165208, + "balance_loss_clip": 1.00213432, + "balance_loss_mlp": 1.00117576, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.3625063449108508, + "language_loss": 0.85107672, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87405372, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.611936092376709 + }, + { + "auxiliary_loss_clip": 0.01165065, + "auxiliary_loss_mlp": 0.01165175, + "balance_loss_clip": 1.00217378, + "balance_loss_mlp": 1.00104725, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.6287723862411452, + "language_loss": 0.74161786, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76492023, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.5868380069732666 + }, + { + "auxiliary_loss_clip": 0.01165064, + "auxiliary_loss_mlp": 0.0116536, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00113773, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.3322707519213686, + "language_loss": 0.78196657, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80527085, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.5324437618255615 + }, + { + "auxiliary_loss_clip": 0.01149785, + "auxiliary_loss_mlp": 0.01165054, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00102162, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 3.006061179713102, + "language_loss": 0.72237062, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74551904, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.54099178314209 + }, + { + "auxiliary_loss_clip": 0.01181585, + "auxiliary_loss_mlp": 0.01165246, + "balance_loss_clip": 1.00236881, + "balance_loss_mlp": 1.00130975, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 2.453777446470461, + "language_loss": 0.72740388, + "learning_rate": 3.873939659120557e-06, + "loss": 0.75087225, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.4644150733947754 + }, + { + "auxiliary_loss_clip": 0.01163601, + "auxiliary_loss_mlp": 0.011596, + "balance_loss_clip": 1.00239301, + "balance_loss_mlp": 1.00014591, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8284304500636707, + "language_loss": 0.56033379, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58356583, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 2.987299919128418 + }, + { + "auxiliary_loss_clip": 0.01131658, + "auxiliary_loss_mlp": 0.01164886, + "balance_loss_clip": 1.00202549, + "balance_loss_mlp": 1.00094962, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.668727160573173, + "language_loss": 0.82795691, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85092235, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.01132494, + "auxiliary_loss_mlp": 0.01164517, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00086617, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.7009712697274837, + "language_loss": 0.81651908, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83948922, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.656681776046753 + }, + { + "auxiliary_loss_clip": 0.01116708, + "auxiliary_loss_mlp": 0.01165327, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00110483, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 2.0331861296496387, + "language_loss": 0.82351255, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84633291, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.658233880996704 + }, + { + "auxiliary_loss_clip": 0.01165393, + "auxiliary_loss_mlp": 0.01165258, + "balance_loss_clip": 1.00239754, + "balance_loss_mlp": 1.00132132, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.7726869282501359, + "language_loss": 0.80593014, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82923663, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.550713539123535 + }, + { + "auxiliary_loss_clip": 0.01164902, + "auxiliary_loss_mlp": 0.0116491, + "balance_loss_clip": 1.0021987, + "balance_loss_mlp": 1.00087833, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 1.9085998116719018, + "language_loss": 0.79708934, + "learning_rate": 3.873121888753442e-06, + "loss": 0.82038748, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.5347208976745605 + }, + { + "auxiliary_loss_clip": 0.01165194, + "auxiliary_loss_mlp": 0.01165154, + "balance_loss_clip": 1.00242829, + "balance_loss_mlp": 1.00093126, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.0972629834361927, + "language_loss": 0.79933321, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82263672, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.5557680130004883 + }, + { + "auxiliary_loss_clip": 0.01101624, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_clip": 1.00221646, + "balance_loss_mlp": 1.00103283, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.6638092041620647, + "language_loss": 0.65701044, + "learning_rate": 3.872848730344146e-06, + "loss": 0.67967731, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.8584086894989014 + }, + { + "auxiliary_loss_clip": 0.01165011, + "auxiliary_loss_mlp": 0.01164672, + "balance_loss_clip": 1.0022397, + "balance_loss_mlp": 1.0010215, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.466105698486453, + "language_loss": 0.7884146, + "learning_rate": 3.87271204460899e-06, + "loss": 0.81171143, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.545279026031494 + }, + { + "auxiliary_loss_clip": 0.01181668, + "auxiliary_loss_mlp": 0.01164906, + "balance_loss_clip": 1.00233519, + "balance_loss_mlp": 1.00106466, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 2.1147048206257675, + "language_loss": 0.80380809, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82727396, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.4789960384368896 + }, + { + "auxiliary_loss_clip": 0.01164663, + "auxiliary_loss_mlp": 0.01164905, + "balance_loss_clip": 1.00230622, + "balance_loss_mlp": 1.00106335, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 2.8698278799022066, + "language_loss": 0.77907032, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80236602, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.5531105995178223 + }, + { + "auxiliary_loss_clip": 0.01147568, + "auxiliary_loss_mlp": 0.01159515, + "balance_loss_clip": 1.00225329, + "balance_loss_mlp": 1.00006092, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8527407166845581, + "language_loss": 0.61537719, + "learning_rate": 3.872301561343699e-06, + "loss": 0.638448, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.094461679458618 + }, + { + "auxiliary_loss_clip": 0.01165449, + "auxiliary_loss_mlp": 0.01164719, + "balance_loss_clip": 1.00220251, + "balance_loss_mlp": 1.0009737, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.5294428721132294, + "language_loss": 0.64427131, + "learning_rate": 3.872164591585956e-06, + "loss": 0.66757298, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.5562896728515625 + }, + { + "auxiliary_loss_clip": 0.01165861, + "auxiliary_loss_mlp": 0.01164948, + "balance_loss_clip": 1.00217974, + "balance_loss_mlp": 1.00062954, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.5338947042701174, + "language_loss": 0.74014294, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.76345104, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.560072660446167 + }, + { + "auxiliary_loss_clip": 0.01164966, + "auxiliary_loss_mlp": 0.01164766, + "balance_loss_clip": 1.00217319, + "balance_loss_mlp": 1.00082982, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 1.9941167666336317, + "language_loss": 0.77477914, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79807639, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.555318593978882 + }, + { + "auxiliary_loss_clip": 0.01181619, + "auxiliary_loss_mlp": 0.01164948, + "balance_loss_clip": 1.00231838, + "balance_loss_mlp": 1.00120246, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 3.239069514749594, + "language_loss": 0.77108961, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79455525, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.5350019931793213 + }, + { + "auxiliary_loss_clip": 0.01165873, + "auxiliary_loss_mlp": 0.01164512, + "balance_loss_clip": 1.00230813, + "balance_loss_mlp": 1.00095713, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.6747668327653207, + "language_loss": 0.86518711, + "learning_rate": 3.871616002680272e-06, + "loss": 0.88849092, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.4983768463134766 + }, + { + "auxiliary_loss_clip": 0.01164658, + "auxiliary_loss_mlp": 0.01165034, + "balance_loss_clip": 1.0022682, + "balance_loss_mlp": 1.00100183, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.6622131962075355, + "language_loss": 0.88983428, + "learning_rate": 3.871478678011177e-06, + "loss": 0.91313124, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 2.591721773147583 + }, + { + "auxiliary_loss_clip": 0.01149242, + "auxiliary_loss_mlp": 0.01164778, + "balance_loss_clip": 1.00229216, + "balance_loss_mlp": 1.00093627, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8413636528852808, + "language_loss": 0.81337118, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83651137, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.564690351486206 + }, + { + "auxiliary_loss_clip": 0.01165969, + "auxiliary_loss_mlp": 0.01165075, + "balance_loss_clip": 1.00225556, + "balance_loss_mlp": 1.00094795, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 3.497365361197833, + "language_loss": 0.83703959, + "learning_rate": 3.871203815778219e-06, + "loss": 0.86035001, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.5868282318115234 + }, + { + "auxiliary_loss_clip": 0.01164076, + "auxiliary_loss_mlp": 0.01159759, + "balance_loss_clip": 1.00227022, + "balance_loss_mlp": 1.00030422, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9083573200663486, + "language_loss": 0.61941898, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64265734, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 3.0824527740478516 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01164929, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00118268, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.9624564536678675, + "language_loss": 0.87029946, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89342749, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.5712413787841797 + }, + { + "auxiliary_loss_clip": 0.01131647, + "auxiliary_loss_mlp": 0.01164934, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00090194, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9496505728563036, + "language_loss": 0.75266325, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77562904, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 2.5978288650512695 + }, + { + "auxiliary_loss_clip": 0.01164, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_clip": 1.00222635, + "balance_loss_mlp": 1.00085449, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6814472021356699, + "language_loss": 0.5184629, + "learning_rate": 3.870653239879212e-06, + "loss": 0.5416984, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 3.0290722846984863 + }, + { + "auxiliary_loss_clip": 0.0118145, + "auxiliary_loss_mlp": 0.01164749, + "balance_loss_clip": 1.00232661, + "balance_loss_mlp": 1.00119376, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 2.277671453150357, + "language_loss": 0.70365751, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72711951, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.4707679748535156 + }, + { + "auxiliary_loss_clip": 0.01133896, + "auxiliary_loss_mlp": 0.01165232, + "balance_loss_clip": 1.00222743, + "balance_loss_mlp": 1.00119996, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8405468849046622, + "language_loss": 0.82151735, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84450859, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.608684539794922 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01164848, + "balance_loss_clip": 1.0021168, + "balance_loss_mlp": 1.00100636, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 2.030320494645102, + "language_loss": 0.72222495, + "learning_rate": 3.870239563115436e-06, + "loss": 0.7453568, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.592297315597534 + }, + { + "auxiliary_loss_clip": 0.01119716, + "auxiliary_loss_mlp": 0.00749052, + "balance_loss_clip": 1.00284135, + "balance_loss_mlp": 1.0007422, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 61.551009983853284, + "language_loss": 0.75787914, + "learning_rate": 3.870101529014526e-06, + "loss": 0.7765668, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 4.228186130523682 + }, + { + "auxiliary_loss_clip": 0.01116088, + "auxiliary_loss_mlp": 0.01164358, + "balance_loss_clip": 1.00207222, + "balance_loss_mlp": 1.00080287, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.2981902658242226, + "language_loss": 0.82096934, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84377384, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.6376826763153076 + }, + { + "auxiliary_loss_clip": 0.01164808, + "auxiliary_loss_mlp": 0.01165215, + "balance_loss_clip": 1.0022043, + "balance_loss_mlp": 1.00127792, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.173644497861676, + "language_loss": 0.74180722, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76510751, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.6218602657318115 + }, + { + "auxiliary_loss_clip": 0.01164747, + "auxiliary_loss_mlp": 0.01164566, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00139201, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 1.6229961858030069, + "language_loss": 0.74159443, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76488757, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 4.115020036697388 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.0116472, + "balance_loss_clip": 1.00226402, + "balance_loss_mlp": 1.00116444, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8823166947851246, + "language_loss": 0.73356855, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75655591, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.69333553314209 + }, + { + "auxiliary_loss_clip": 0.01149225, + "auxiliary_loss_mlp": 0.01164827, + "balance_loss_clip": 1.00229096, + "balance_loss_mlp": 1.00127232, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 2.6888930358668603, + "language_loss": 0.91025203, + "learning_rate": 3.869410294898195e-06, + "loss": 0.93339258, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 4.118345022201538 + }, + { + "auxiliary_loss_clip": 0.01150271, + "auxiliary_loss_mlp": 0.0116497, + "balance_loss_clip": 1.00231171, + "balance_loss_mlp": 1.00122416, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.6383670936408254, + "language_loss": 0.65165329, + "learning_rate": 3.869271835389268e-06, + "loss": 0.6748057, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.6146762371063232 + }, + { + "auxiliary_loss_clip": 0.01153479, + "auxiliary_loss_mlp": 0.01164565, + "balance_loss_clip": 1.00291395, + "balance_loss_mlp": 1.00110483, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 1.965030794477659, + "language_loss": 0.80537587, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.8285563, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.5521440505981445 + }, + { + "auxiliary_loss_clip": 0.01150217, + "auxiliary_loss_mlp": 0.0116516, + "balance_loss_clip": 1.00238693, + "balance_loss_mlp": 1.00122285, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 1.8187559124299009, + "language_loss": 0.82935762, + "learning_rate": 3.868994703727742e-06, + "loss": 0.85251141, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.6898574829101562 + }, + { + "auxiliary_loss_clip": 0.01133068, + "auxiliary_loss_mlp": 0.01164961, + "balance_loss_clip": 1.00214362, + "balance_loss_mlp": 1.00102425, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.445935461562888, + "language_loss": 0.87265217, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89563251, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.5993666648864746 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01165275, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00124288, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4980292649248244, + "language_loss": 0.75971889, + "learning_rate": 3.868717288576354e-06, + "loss": 0.78269804, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.668513536453247 + }, + { + "auxiliary_loss_clip": 0.0116568, + "auxiliary_loss_mlp": 0.00749011, + "balance_loss_clip": 1.00225949, + "balance_loss_mlp": 1.00063968, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.613578150556455, + "language_loss": 0.8312403, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85038722, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.5420501232147217 + }, + { + "auxiliary_loss_clip": 0.01181562, + "auxiliary_loss_mlp": 0.01164863, + "balance_loss_clip": 1.00238967, + "balance_loss_mlp": 1.00102198, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 2.0209861438700045, + "language_loss": 0.83143413, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85489839, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.457979440689087 + }, + { + "auxiliary_loss_clip": 0.01181536, + "auxiliary_loss_mlp": 0.0116488, + "balance_loss_clip": 1.00236058, + "balance_loss_mlp": 1.00113392, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.141127077632468, + "language_loss": 0.84577358, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86923766, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.486908197402954 + }, + { + "auxiliary_loss_clip": 0.01149242, + "auxiliary_loss_mlp": 0.01164812, + "balance_loss_clip": 1.00230861, + "balance_loss_mlp": 1.00135231, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 4.893526300514655, + "language_loss": 0.85875404, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88189459, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.549783945083618 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01164315, + "balance_loss_clip": 1.00231183, + "balance_loss_mlp": 1.00095057, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 1.953139399367803, + "language_loss": 0.79491061, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81820786, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.5980334281921387 + }, + { + "auxiliary_loss_clip": 0.011654, + "auxiliary_loss_mlp": 0.01165015, + "balance_loss_clip": 1.00232863, + "balance_loss_mlp": 1.00136411, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.406320918912757, + "language_loss": 0.77379966, + "learning_rate": 3.867883342604009e-06, + "loss": 0.79710376, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.5218024253845215 + }, + { + "auxiliary_loss_clip": 0.01165502, + "auxiliary_loss_mlp": 0.01164603, + "balance_loss_clip": 1.00239635, + "balance_loss_mlp": 1.00095248, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 1.6139932026849086, + "language_loss": 0.9319644, + "learning_rate": 3.867744103671717e-06, + "loss": 0.9552654, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.570159673690796 + }, + { + "auxiliary_loss_clip": 0.01149033, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_clip": 1.00215662, + "balance_loss_mlp": 1.00103378, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.7529350690624592, + "language_loss": 0.91394573, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93708193, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.5901412963867188 + }, + { + "auxiliary_loss_clip": 0.01164896, + "auxiliary_loss_mlp": 0.01164705, + "balance_loss_clip": 1.00227845, + "balance_loss_mlp": 1.00115025, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.6654206450369955, + "language_loss": 0.7403115, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76360756, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.5540430545806885 + }, + { + "auxiliary_loss_clip": 0.01131778, + "auxiliary_loss_mlp": 0.01164598, + "balance_loss_clip": 1.0020926, + "balance_loss_mlp": 1.0012337, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.7562410156496928, + "language_loss": 0.78646564, + "learning_rate": 3.867325961945714e-06, + "loss": 0.80942941, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.598606824874878 + }, + { + "auxiliary_loss_clip": 0.01132725, + "auxiliary_loss_mlp": 0.01165291, + "balance_loss_clip": 1.00232983, + "balance_loss_mlp": 1.00125933, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.4913940286634757, + "language_loss": 0.88367844, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90665859, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.6130905151367188 + }, + { + "auxiliary_loss_clip": 0.01149228, + "auxiliary_loss_mlp": 0.01164491, + "balance_loss_clip": 1.00220585, + "balance_loss_mlp": 1.00103116, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.004671401168684, + "language_loss": 0.7701124, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79324955, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.5486648082733154 + }, + { + "auxiliary_loss_clip": 0.01131487, + "auxiliary_loss_mlp": 0.01164558, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00100279, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.281920667475284, + "language_loss": 0.77009571, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79305613, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.6573429107666016 + }, + { + "auxiliary_loss_clip": 0.01132849, + "auxiliary_loss_mlp": 0.01164919, + "balance_loss_clip": 1.00231147, + "balance_loss_mlp": 1.0011735, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 4.227025895158705, + "language_loss": 0.88464737, + "learning_rate": 3.866767448340471e-06, + "loss": 0.90762508, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.585906744003296 + }, + { + "auxiliary_loss_clip": 0.01164964, + "auxiliary_loss_mlp": 0.01164714, + "balance_loss_clip": 1.00230455, + "balance_loss_mlp": 1.00106359, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 2.7626396791385583, + "language_loss": 0.79572421, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81902099, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.51690411567688 + }, + { + "auxiliary_loss_clip": 0.01165829, + "auxiliary_loss_mlp": 0.01164051, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00106835, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.7790415326631315, + "language_loss": 0.75169563, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77499449, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.5862691402435303 + }, + { + "auxiliary_loss_clip": 0.01181516, + "auxiliary_loss_mlp": 0.01164369, + "balance_loss_clip": 1.00241876, + "balance_loss_mlp": 1.00090933, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5800029691849447, + "language_loss": 0.78829169, + "learning_rate": 3.866347819843925e-06, + "loss": 0.81175053, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.4868593215942383 + }, + { + "auxiliary_loss_clip": 0.01148432, + "auxiliary_loss_mlp": 0.01164658, + "balance_loss_clip": 1.00214088, + "balance_loss_mlp": 1.00110281, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.0644625788341315, + "language_loss": 0.82085729, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84398818, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.587618589401245 + }, + { + "auxiliary_loss_clip": 0.01166026, + "auxiliary_loss_mlp": 0.01164635, + "balance_loss_clip": 1.0025568, + "balance_loss_mlp": 1.00108027, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.6701600882612064, + "language_loss": 0.8236168, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84692341, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.578542709350586 + }, + { + "auxiliary_loss_clip": 0.01149058, + "auxiliary_loss_mlp": 0.01164948, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00129783, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.3733763428984913, + "language_loss": 0.83204681, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85518682, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.544092893600464 + }, + { + "auxiliary_loss_clip": 0.01165391, + "auxiliary_loss_mlp": 0.01164575, + "balance_loss_clip": 1.00254142, + "balance_loss_mlp": 1.00121117, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.7067885502692701, + "language_loss": 0.75030839, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77360803, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 2.585367441177368 + }, + { + "auxiliary_loss_clip": 0.01131013, + "auxiliary_loss_mlp": 0.01158759, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00006711, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8612610230173323, + "language_loss": 0.61796904, + "learning_rate": 3.865647023645277e-06, + "loss": 0.6408667, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 3.0724778175354004 + }, + { + "auxiliary_loss_clip": 0.01164837, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00121808, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.203249362737262, + "language_loss": 0.77128363, + "learning_rate": 3.865506652147709e-06, + "loss": 0.79458064, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.5076751708984375 + }, + { + "auxiliary_loss_clip": 0.01181418, + "auxiliary_loss_mlp": 0.01164488, + "balance_loss_clip": 1.00231719, + "balance_loss_mlp": 1.00112414, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8191876934243607, + "language_loss": 0.76601565, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78947473, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.544332981109619 + }, + { + "auxiliary_loss_clip": 0.01181346, + "auxiliary_loss_mlp": 0.01164057, + "balance_loss_clip": 1.00230479, + "balance_loss_mlp": 1.00097883, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.614795317507341, + "language_loss": 0.85926342, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88271743, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.661525011062622 + }, + { + "auxiliary_loss_clip": 0.01137931, + "auxiliary_loss_mlp": 0.01164164, + "balance_loss_clip": 1.00260055, + "balance_loss_mlp": 1.00099015, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.6638140072728016, + "language_loss": 0.83300817, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85602909, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 2.602320671081543 + }, + { + "auxiliary_loss_clip": 0.01149986, + "auxiliary_loss_mlp": 0.00748963, + "balance_loss_clip": 1.0022949, + "balance_loss_mlp": 1.00074697, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.5182531685895406, + "language_loss": 0.83194685, + "learning_rate": 3.864944458808712e-06, + "loss": 0.85093641, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 4.125833988189697 + }, + { + "auxiliary_loss_clip": 0.01181473, + "auxiliary_loss_mlp": 0.01164278, + "balance_loss_clip": 1.00240803, + "balance_loss_mlp": 1.00091386, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.857609536854908, + "language_loss": 0.80231982, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82577729, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.4845378398895264 + }, + { + "auxiliary_loss_clip": 0.0116572, + "auxiliary_loss_mlp": 0.01164091, + "balance_loss_clip": 1.0024426, + "balance_loss_mlp": 1.00110781, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 3.5095703940471688, + "language_loss": 0.64991665, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67321473, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.5687057971954346 + }, + { + "auxiliary_loss_clip": 0.01150052, + "auxiliary_loss_mlp": 0.01164138, + "balance_loss_clip": 1.00227392, + "balance_loss_mlp": 1.00115502, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.6012428790068463, + "language_loss": 0.82223308, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84537494, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.575749635696411 + }, + { + "auxiliary_loss_clip": 0.01148739, + "auxiliary_loss_mlp": 0.01164176, + "balance_loss_clip": 1.00223386, + "balance_loss_mlp": 1.00119281, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.8631844395907724, + "language_loss": 0.74840701, + "learning_rate": 3.864381133967676e-06, + "loss": 0.77153611, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 4.030646085739136 + }, + { + "auxiliary_loss_clip": 0.01149203, + "auxiliary_loss_mlp": 0.01163842, + "balance_loss_clip": 1.00212884, + "balance_loss_mlp": 1.00095391, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7685026143947016, + "language_loss": 0.80807316, + "learning_rate": 3.86424012600026e-06, + "loss": 0.83120364, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.571594715118408 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01164269, + "balance_loss_clip": 1.00218368, + "balance_loss_mlp": 1.00100005, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.7178499287687505, + "language_loss": 0.844657, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86762875, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 4.017939567565918 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.00749082, + "balance_loss_clip": 1.00259709, + "balance_loss_mlp": 1.00077724, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 2.228224727363408, + "language_loss": 0.70622492, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72508472, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.646798849105835 + }, + { + "auxiliary_loss_clip": 0.01148828, + "auxiliary_loss_mlp": 0.01163715, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00101781, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.966702731296258, + "language_loss": 0.73263133, + "learning_rate": 3.863816677966381e-06, + "loss": 0.7557568, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.539766788482666 + }, + { + "auxiliary_loss_clip": 0.01117239, + "auxiliary_loss_mlp": 0.01163766, + "balance_loss_clip": 1.00221872, + "balance_loss_mlp": 1.00097382, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.5685924464802463, + "language_loss": 0.73381805, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75662804, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.6217942237854004 + }, + { + "auxiliary_loss_clip": 0.01164759, + "auxiliary_loss_mlp": 0.01164391, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00093079, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 2.4425215541956784, + "language_loss": 0.76031262, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.78360415, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.552345037460327 + }, + { + "auxiliary_loss_clip": 0.01181174, + "auxiliary_loss_mlp": 0.01163756, + "balance_loss_clip": 1.00222278, + "balance_loss_mlp": 1.00086832, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.5879985967973427, + "language_loss": 0.79393935, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81738859, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.4967200756073 + }, + { + "auxiliary_loss_clip": 0.01164655, + "auxiliary_loss_mlp": 0.01164143, + "balance_loss_clip": 1.00220561, + "balance_loss_mlp": 1.00106478, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1074053160125352, + "language_loss": 0.82648039, + "learning_rate": 3.863251091147299e-06, + "loss": 0.8497684, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.5291054248809814 + }, + { + "auxiliary_loss_clip": 0.01117486, + "auxiliary_loss_mlp": 0.01164364, + "balance_loss_clip": 1.00219679, + "balance_loss_mlp": 1.00099969, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.927963334416754, + "language_loss": 0.74831164, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7711302, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.751286268234253 + }, + { + "auxiliary_loss_clip": 0.01181481, + "auxiliary_loss_mlp": 0.01164255, + "balance_loss_clip": 1.00246811, + "balance_loss_mlp": 1.00098634, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.772278838657097, + "language_loss": 0.81854206, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.84199941, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 2.4644227027893066 + }, + { + "auxiliary_loss_clip": 0.01152745, + "auxiliary_loss_mlp": 0.01164132, + "balance_loss_clip": 1.00215387, + "balance_loss_mlp": 1.00105417, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 2.0030988268824297, + "language_loss": 0.70129019, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72445899, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.646038770675659 + }, + { + "auxiliary_loss_clip": 0.01165225, + "auxiliary_loss_mlp": 0.01164265, + "balance_loss_clip": 1.00252712, + "balance_loss_mlp": 1.00118697, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 1.7905819607445133, + "language_loss": 0.76927191, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79256684, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.5104446411132812 + }, + { + "auxiliary_loss_clip": 0.01162816, + "auxiliary_loss_mlp": 0.01158055, + "balance_loss_clip": 1.00222898, + "balance_loss_mlp": 1.00012672, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9030834814607999, + "language_loss": 0.58917546, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.61238408, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.0670955181121826 + }, + { + "auxiliary_loss_clip": 0.01145658, + "auxiliary_loss_mlp": 0.01158021, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00009298, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8726655229949707, + "language_loss": 0.62190616, + "learning_rate": 3.862400591386154e-06, + "loss": 0.644943, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.1468777656555176 + }, + { + "auxiliary_loss_clip": 0.01164556, + "auxiliary_loss_mlp": 0.01163852, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00096428, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.995583551896268, + "language_loss": 0.72467887, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74796295, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.607447862625122 + }, + { + "auxiliary_loss_clip": 0.01131266, + "auxiliary_loss_mlp": 0.0115793, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00000155, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7107327608789289, + "language_loss": 0.60369289, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62658477, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.236630439758301 + }, + { + "auxiliary_loss_clip": 0.01181438, + "auxiliary_loss_mlp": 0.01164143, + "balance_loss_clip": 1.00235665, + "balance_loss_mlp": 1.00106454, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 3.1433523304142823, + "language_loss": 0.79159844, + "learning_rate": 3.861974388030356e-06, + "loss": 0.81505424, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.5887746810913086 + }, + { + "auxiliary_loss_clip": 0.01134104, + "auxiliary_loss_mlp": 0.0116357, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 1.00087309, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8934112614213583, + "language_loss": 0.71370834, + "learning_rate": 3.861832179025394e-06, + "loss": 0.7366851, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.617894172668457 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_clip": 1.0024786, + "balance_loss_mlp": 1.00082445, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.5671512774319925, + "language_loss": 0.8989799, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92214465, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.580658197402954 + }, + { + "auxiliary_loss_clip": 0.01165167, + "auxiliary_loss_mlp": 0.01163738, + "balance_loss_clip": 1.00222611, + "balance_loss_mlp": 1.00094581, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 1.8539887807032327, + "language_loss": 0.83078104, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85407007, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.548081159591675 + }, + { + "auxiliary_loss_clip": 0.01117617, + "auxiliary_loss_mlp": 0.01163916, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00112426, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.5817713614980176, + "language_loss": 0.81938648, + "learning_rate": 3.861405128426914e-06, + "loss": 0.84220183, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.6461660861968994 + }, + { + "auxiliary_loss_clip": 0.01132245, + "auxiliary_loss_mlp": 0.00748509, + "balance_loss_clip": 1.00296962, + "balance_loss_mlp": 1.00025368, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.914645215821506, + "language_loss": 0.63360512, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65241265, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.2132372856140137 + }, + { + "auxiliary_loss_clip": 0.011152, + "auxiliary_loss_mlp": 0.00749025, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.0008347, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.6640144178258645, + "language_loss": 0.82837176, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84701401, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.703021764755249 + }, + { + "auxiliary_loss_clip": 0.01147782, + "auxiliary_loss_mlp": 0.01163628, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00102639, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.7999566329081182, + "language_loss": 0.7885555, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81166959, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.559420108795166 + }, + { + "auxiliary_loss_clip": 0.01164644, + "auxiliary_loss_mlp": 0.01164052, + "balance_loss_clip": 1.00227189, + "balance_loss_mlp": 1.0011642, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4970167737703384, + "language_loss": 0.83446985, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85775679, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.5435903072357178 + }, + { + "auxiliary_loss_clip": 0.011814, + "auxiliary_loss_mlp": 0.01163582, + "balance_loss_clip": 1.00243819, + "balance_loss_mlp": 1.00117135, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.7318638569687215, + "language_loss": 0.87221503, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89566481, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.4944612979888916 + }, + { + "auxiliary_loss_clip": 0.01133929, + "auxiliary_loss_mlp": 0.01163946, + "balance_loss_clip": 1.00215161, + "balance_loss_mlp": 1.00105834, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9069412653313687, + "language_loss": 0.67024899, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69322777, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.584169864654541 + }, + { + "auxiliary_loss_clip": 0.01165582, + "auxiliary_loss_mlp": 0.01163424, + "balance_loss_clip": 1.00231135, + "balance_loss_mlp": 1.00091827, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7850999508098249, + "language_loss": 0.8364557, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85974574, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.5560967922210693 + }, + { + "auxiliary_loss_clip": 0.01132743, + "auxiliary_loss_mlp": 0.01163631, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.0011251, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.7105989941287472, + "language_loss": 0.78886509, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81182885, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 2.5927469730377197 + }, + { + "auxiliary_loss_clip": 0.01181608, + "auxiliary_loss_mlp": 0.01164253, + "balance_loss_clip": 1.00260556, + "balance_loss_mlp": 1.00117469, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.193354045761417, + "language_loss": 0.83119261, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85465121, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.516716480255127 + }, + { + "auxiliary_loss_clip": 0.01165787, + "auxiliary_loss_mlp": 0.01164088, + "balance_loss_clip": 1.00235927, + "balance_loss_mlp": 1.00110483, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.928881113211991, + "language_loss": 0.78918773, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81248653, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 2.541944742202759 + }, + { + "auxiliary_loss_clip": 0.0118132, + "auxiliary_loss_mlp": 0.00749162, + "balance_loss_clip": 1.0023334, + "balance_loss_mlp": 1.00087142, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 2.0726729929274814, + "language_loss": 0.79989982, + "learning_rate": 3.859833842323822e-06, + "loss": 0.81920469, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.5341711044311523 + }, + { + "auxiliary_loss_clip": 0.01132761, + "auxiliary_loss_mlp": 0.011636, + "balance_loss_clip": 1.00241828, + "balance_loss_mlp": 1.00099826, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 8.884527932755326, + "language_loss": 0.78207397, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80503756, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.599384307861328 + }, + { + "auxiliary_loss_clip": 0.0113493, + "auxiliary_loss_mlp": 0.01157986, + "balance_loss_clip": 1.00231516, + "balance_loss_mlp": 1.00005758, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8444297506048881, + "language_loss": 0.58431005, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60723925, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 4.666586637496948 + }, + { + "auxiliary_loss_clip": 0.01181064, + "auxiliary_loss_mlp": 0.0116339, + "balance_loss_clip": 1.0022856, + "balance_loss_mlp": 1.00097919, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.5459098844752583, + "language_loss": 0.88271159, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90615618, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.485611915588379 + }, + { + "auxiliary_loss_clip": 0.01164657, + "auxiliary_loss_mlp": 0.00749089, + "balance_loss_clip": 1.00228906, + "balance_loss_mlp": 1.0009439, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.058236246526757, + "language_loss": 0.74808705, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76722455, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.55938982963562 + }, + { + "auxiliary_loss_clip": 0.01181229, + "auxiliary_loss_mlp": 0.01163681, + "balance_loss_clip": 1.00224948, + "balance_loss_mlp": 1.0008893, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 1.9548938375490414, + "language_loss": 0.73968828, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76313734, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 3.957622528076172 + }, + { + "auxiliary_loss_clip": 0.01164577, + "auxiliary_loss_mlp": 0.01163936, + "balance_loss_clip": 1.00237989, + "balance_loss_mlp": 1.00095344, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 5.27640008311336, + "language_loss": 0.74759042, + "learning_rate": 3.858973179936668e-06, + "loss": 0.77087557, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.556380033493042 + }, + { + "auxiliary_loss_clip": 0.01164686, + "auxiliary_loss_mlp": 0.01163889, + "balance_loss_clip": 1.0023222, + "balance_loss_mlp": 1.00119257, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 1.9447207674327929, + "language_loss": 0.7447294, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76801515, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 4.157334089279175 + }, + { + "auxiliary_loss_clip": 0.01181111, + "auxiliary_loss_mlp": 0.01163501, + "balance_loss_clip": 1.00234222, + "balance_loss_mlp": 1.00109076, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6351713085701538, + "language_loss": 0.83372033, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85716647, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 2.5550029277801514 + }, + { + "auxiliary_loss_clip": 0.0116524, + "auxiliary_loss_mlp": 0.01164129, + "balance_loss_clip": 1.00252759, + "balance_loss_mlp": 1.00095558, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 1.9659532105557944, + "language_loss": 0.72323149, + "learning_rate": 3.858541897021563e-06, + "loss": 0.74652517, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.555945873260498 + }, + { + "auxiliary_loss_clip": 0.0113635, + "auxiliary_loss_mlp": 0.01164111, + "balance_loss_clip": 1.00241411, + "balance_loss_mlp": 1.00093782, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 4.094768161714525, + "language_loss": 0.81152433, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83452892, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 2.5418996810913086 + }, + { + "auxiliary_loss_clip": 0.01165462, + "auxiliary_loss_mlp": 0.01163747, + "balance_loss_clip": 1.0023483, + "balance_loss_mlp": 1.00095487, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.8307207652668784, + "language_loss": 0.83140141, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85469353, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.532945394515991 + }, + { + "auxiliary_loss_clip": 0.01148061, + "auxiliary_loss_mlp": 0.01163775, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00107837, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 2.003807861437796, + "language_loss": 0.71053779, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73365617, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.602012872695923 + }, + { + "auxiliary_loss_clip": 0.01179134, + "auxiliary_loss_mlp": 0.01157225, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00005996, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.818683223073965, + "language_loss": 0.63104188, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65440547, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 2.9774253368377686 + }, + { + "auxiliary_loss_clip": 0.01119425, + "auxiliary_loss_mlp": 0.01163547, + "balance_loss_clip": 1.00258636, + "balance_loss_mlp": 1.00104117, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.54877483683293, + "language_loss": 0.74929535, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77212507, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.6990091800689697 + }, + { + "auxiliary_loss_clip": 0.01181239, + "auxiliary_loss_mlp": 0.01163496, + "balance_loss_clip": 1.00241053, + "balance_loss_mlp": 1.0010848, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.5047520121311266, + "language_loss": 0.85857683, + "learning_rate": 3.857677428484242e-06, + "loss": 0.88202417, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.5512006282806396 + }, + { + "auxiliary_loss_clip": 0.01179051, + "auxiliary_loss_mlp": 0.01157178, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00001204, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7618205893304646, + "language_loss": 0.56816554, + "learning_rate": 3.857533103811195e-06, + "loss": 0.59152782, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 3.0093233585357666 + }, + { + "auxiliary_loss_clip": 0.01149735, + "auxiliary_loss_mlp": 0.01163296, + "balance_loss_clip": 1.00228739, + "balance_loss_mlp": 1.00098038, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.8759003024851104, + "language_loss": 0.85514611, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87827635, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 2.565049409866333 + }, + { + "auxiliary_loss_clip": 0.01165381, + "auxiliary_loss_mlp": 0.01163743, + "balance_loss_clip": 1.00229955, + "balance_loss_mlp": 1.00114202, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.074155920415174, + "language_loss": 0.74634564, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76963693, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.5151190757751465 + }, + { + "auxiliary_loss_clip": 0.01148658, + "auxiliary_loss_mlp": 0.01162873, + "balance_loss_clip": 1.00205088, + "balance_loss_mlp": 1.00065327, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6491855141790182, + "language_loss": 0.82470512, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8478204, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.5963475704193115 + }, + { + "auxiliary_loss_clip": 0.01087107, + "auxiliary_loss_mlp": 0.01163347, + "balance_loss_clip": 1.00233614, + "balance_loss_mlp": 1.00084043, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.5163687821555671, + "language_loss": 0.74262691, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76513147, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.829070806503296 + }, + { + "auxiliary_loss_clip": 0.01147861, + "auxiliary_loss_mlp": 0.01163804, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00110745, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.5968150134210237, + "language_loss": 0.76164043, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78475708, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 3.094770669937134 + }, + { + "auxiliary_loss_clip": 0.01148642, + "auxiliary_loss_mlp": 0.0116347, + "balance_loss_clip": 1.00217438, + "balance_loss_mlp": 1.00096345, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 1.9654652316427492, + "language_loss": 0.83140326, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85452437, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.5875988006591797 + }, + { + "auxiliary_loss_clip": 0.01120804, + "auxiliary_loss_mlp": 0.01163785, + "balance_loss_clip": 1.00240016, + "balance_loss_mlp": 1.00108767, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 1.942868913520111, + "language_loss": 0.84274447, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86559033, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.7613656520843506 + }, + { + "auxiliary_loss_clip": 0.01169272, + "auxiliary_loss_mlp": 0.01163158, + "balance_loss_clip": 1.00276303, + "balance_loss_mlp": 1.00084233, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.9939801063993567, + "language_loss": 0.84776652, + "learning_rate": 3.856375971124805e-06, + "loss": 0.87109083, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.5707218647003174 + }, + { + "auxiliary_loss_clip": 0.01165416, + "auxiliary_loss_mlp": 0.01163012, + "balance_loss_clip": 1.00238848, + "balance_loss_mlp": 1.00088727, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.8352679094550453, + "language_loss": 0.75678194, + "learning_rate": 3.856231012708527e-06, + "loss": 0.78006625, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.543170928955078 + }, + { + "auxiliary_loss_clip": 0.01116656, + "auxiliary_loss_mlp": 0.01163791, + "balance_loss_clip": 1.0020076, + "balance_loss_mlp": 1.00099862, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 1.776518730095667, + "language_loss": 0.8289271, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85173154, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.747929573059082 + }, + { + "auxiliary_loss_clip": 0.01148804, + "auxiliary_loss_mlp": 0.01163233, + "balance_loss_clip": 1.00234938, + "balance_loss_mlp": 1.00091791, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.4907888885737512, + "language_loss": 0.75923133, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78235173, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.604414463043213 + }, + { + "auxiliary_loss_clip": 0.01131447, + "auxiliary_loss_mlp": 0.01163929, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00104117, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.7018454477353537, + "language_loss": 0.81603897, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83899271, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.7186007499694824 + }, + { + "auxiliary_loss_clip": 0.01165587, + "auxiliary_loss_mlp": 0.01163783, + "balance_loss_clip": 1.00239408, + "balance_loss_mlp": 1.00118148, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.3177047314767476, + "language_loss": 0.6603235, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68361723, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.4953665733337402 + }, + { + "auxiliary_loss_clip": 0.01148005, + "auxiliary_loss_mlp": 0.01163664, + "balance_loss_clip": 1.00212944, + "balance_loss_mlp": 1.00106215, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 2.0630375208604437, + "language_loss": 0.6760335, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.6991502, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 2.878211736679077 + }, + { + "auxiliary_loss_clip": 0.01165092, + "auxiliary_loss_mlp": 0.01163354, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.00113416, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 1.7322260922744293, + "language_loss": 0.76664543, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78992999, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.5550642013549805 + }, + { + "auxiliary_loss_clip": 0.01153767, + "auxiliary_loss_mlp": 0.01163126, + "balance_loss_clip": 1.0031842, + "balance_loss_mlp": 1.00100183, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.635994372139527, + "language_loss": 0.79861069, + "learning_rate": 3.855214333225688e-06, + "loss": 0.82177961, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.5816519260406494 + }, + { + "auxiliary_loss_clip": 0.0118121, + "auxiliary_loss_mlp": 0.01163571, + "balance_loss_clip": 1.00237703, + "balance_loss_mlp": 1.00087428, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 1.7095374599043587, + "language_loss": 0.76575375, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78920162, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.521434783935547 + }, + { + "auxiliary_loss_clip": 0.01097372, + "auxiliary_loss_mlp": 0.01157379, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00021327, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7775819442218713, + "language_loss": 0.60053575, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62308335, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.4409849643707275 + }, + { + "auxiliary_loss_clip": 0.0114959, + "auxiliary_loss_mlp": 0.01163417, + "balance_loss_clip": 1.00222099, + "balance_loss_mlp": 1.0010066, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.4673236074987206, + "language_loss": 0.87724298, + "learning_rate": 3.85477755808841e-06, + "loss": 0.90037304, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 2.9870927333831787 + }, + { + "auxiliary_loss_clip": 0.01136556, + "auxiliary_loss_mlp": 0.01163645, + "balance_loss_clip": 1.00255132, + "balance_loss_mlp": 1.00085294, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 2.1077237257884343, + "language_loss": 0.75945133, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78245336, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 2.6428961753845215 + }, + { + "auxiliary_loss_clip": 0.01131662, + "auxiliary_loss_mlp": 0.01162996, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00087166, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.1976043130608396, + "language_loss": 0.75527281, + "learning_rate": 3.854486022987603e-06, + "loss": 0.77821934, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.5945544242858887 + }, + { + "auxiliary_loss_clip": 0.01180945, + "auxiliary_loss_mlp": 0.01163409, + "balance_loss_clip": 1.00228822, + "balance_loss_mlp": 1.00090337, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.7902296504541264, + "language_loss": 0.72604609, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74948955, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 2.5167396068573 + }, + { + "auxiliary_loss_clip": 0.01149097, + "auxiliary_loss_mlp": 0.01163499, + "balance_loss_clip": 1.00229371, + "balance_loss_mlp": 1.000898, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.044093820268022, + "language_loss": 0.89563441, + "learning_rate": 3.854194206597615e-06, + "loss": 0.9187603, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.584989070892334 + }, + { + "auxiliary_loss_clip": 0.011329, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_clip": 1.00221884, + "balance_loss_mlp": 1.00088632, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.389776989271599, + "language_loss": 0.80496359, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82792461, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 3.992246389389038 + }, + { + "auxiliary_loss_clip": 0.01164577, + "auxiliary_loss_mlp": 0.01163989, + "balance_loss_clip": 1.00222588, + "balance_loss_mlp": 1.00129211, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.6490504552034104, + "language_loss": 0.77778912, + "learning_rate": 3.853902108962709e-06, + "loss": 0.80107474, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.5205135345458984 + }, + { + "auxiliary_loss_clip": 0.01131683, + "auxiliary_loss_mlp": 0.0116364, + "balance_loss_clip": 1.00208294, + "balance_loss_mlp": 1.00122905, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.9970587789500422, + "language_loss": 0.82192332, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84487653, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.608369827270508 + }, + { + "auxiliary_loss_clip": 0.01116284, + "auxiliary_loss_mlp": 0.01163316, + "balance_loss_clip": 1.00236785, + "balance_loss_mlp": 1.00100088, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.7760805916757867, + "language_loss": 0.80730116, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83009708, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 4.022028923034668 + }, + { + "auxiliary_loss_clip": 0.01164905, + "auxiliary_loss_mlp": 0.01162978, + "balance_loss_clip": 1.00234604, + "balance_loss_mlp": 1.00094855, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.734656983287945, + "language_loss": 0.78069055, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80396938, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.6340739727020264 + }, + { + "auxiliary_loss_clip": 0.01147822, + "auxiliary_loss_mlp": 0.01157639, + "balance_loss_clip": 1.00240326, + "balance_loss_mlp": 1.00047338, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8116420476238302, + "language_loss": 0.6026969, + "learning_rate": 3.853317070135407e-06, + "loss": 0.6257515, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 4.742049932479858 + }, + { + "auxiliary_loss_clip": 0.01116261, + "auxiliary_loss_mlp": 0.0116321, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.0010854, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.0539003314148334, + "language_loss": 0.70783323, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73062789, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.6899187564849854 + }, + { + "auxiliary_loss_clip": 0.01147583, + "auxiliary_loss_mlp": 0.01163143, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00101829, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5913880868232357, + "language_loss": 0.81259423, + "learning_rate": 3.853024129031751e-06, + "loss": 0.83570153, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.6031618118286133 + }, + { + "auxiliary_loss_clip": 0.01132475, + "auxiliary_loss_mlp": 0.01163579, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00135899, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.1425257588423285, + "language_loss": 0.84330565, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86626625, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 2.593456745147705 + }, + { + "auxiliary_loss_clip": 0.01164263, + "auxiliary_loss_mlp": 0.01163531, + "balance_loss_clip": 1.00208056, + "balance_loss_mlp": 1.00111985, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 2.1539970955701735, + "language_loss": 0.77185744, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79513538, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.52980899810791 + }, + { + "auxiliary_loss_clip": 0.01132233, + "auxiliary_loss_mlp": 0.01163627, + "balance_loss_clip": 1.00223672, + "balance_loss_mlp": 1.00092983, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.2914275062444847, + "language_loss": 0.78585947, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80881804, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.6088104248046875 + }, + { + "auxiliary_loss_clip": 0.01164799, + "auxiliary_loss_mlp": 0.00748894, + "balance_loss_clip": 1.00226951, + "balance_loss_mlp": 1.00071347, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.5296888132068085, + "language_loss": 0.7050485, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72418541, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.5590271949768066 + }, + { + "auxiliary_loss_clip": 0.01148641, + "auxiliary_loss_mlp": 0.00749123, + "balance_loss_clip": 1.00220084, + "balance_loss_mlp": 1.00079513, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.0723520638943005, + "language_loss": 0.84566641, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86464405, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.5849318504333496 + }, + { + "auxiliary_loss_clip": 0.01165571, + "auxiliary_loss_mlp": 0.0116356, + "balance_loss_clip": 1.00240684, + "balance_loss_mlp": 1.0011493, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 1.8793126264556457, + "language_loss": 0.84608817, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.86937946, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.5611631870269775 + }, + { + "auxiliary_loss_clip": 0.01164083, + "auxiliary_loss_mlp": 0.01163003, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00106883, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.2922049368395574, + "language_loss": 0.74639171, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76966256, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.508190631866455 + }, + { + "auxiliary_loss_clip": 0.01165457, + "auxiliary_loss_mlp": 0.01163146, + "balance_loss_clip": 1.00241637, + "balance_loss_mlp": 1.00102115, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 4.698735023180489, + "language_loss": 0.7229256, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74621165, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.6546902656555176 + }, + { + "auxiliary_loss_clip": 0.01147693, + "auxiliary_loss_mlp": 0.01163769, + "balance_loss_clip": 1.00208962, + "balance_loss_mlp": 1.00126243, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.473065521233709, + "language_loss": 0.7111389, + "learning_rate": 3.851702416498235e-06, + "loss": 0.73425353, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.5185184478759766 + }, + { + "auxiliary_loss_clip": 0.01148794, + "auxiliary_loss_mlp": 0.01163737, + "balance_loss_clip": 1.00228739, + "balance_loss_mlp": 1.00094521, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 2.799055406618534, + "language_loss": 0.81871998, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.84184521, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.5258586406707764 + }, + { + "auxiliary_loss_clip": 0.01132826, + "auxiliary_loss_mlp": 0.01163926, + "balance_loss_clip": 1.0022893, + "balance_loss_mlp": 1.00113368, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.957169331940878, + "language_loss": 0.80074054, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82370806, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.7366788387298584 + }, + { + "auxiliary_loss_clip": 0.0116545, + "auxiliary_loss_mlp": 0.01163332, + "balance_loss_clip": 1.00229418, + "balance_loss_mlp": 1.00092161, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 1.7808470613318539, + "language_loss": 0.90999448, + "learning_rate": 3.851260581551727e-06, + "loss": 0.93328226, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.5701217651367188 + }, + { + "auxiliary_loss_clip": 0.01164427, + "auxiliary_loss_mlp": 0.01163736, + "balance_loss_clip": 1.00226688, + "balance_loss_mlp": 1.00142097, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 3.025360181218193, + "language_loss": 0.78846049, + "learning_rate": 3.851113162828802e-06, + "loss": 0.81174207, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.4994475841522217 + }, + { + "auxiliary_loss_clip": 0.01164011, + "auxiliary_loss_mlp": 0.01163389, + "balance_loss_clip": 1.00225592, + "balance_loss_mlp": 1.0008831, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 1.6543992033668984, + "language_loss": 0.8032499, + "learning_rate": 3.85096567391148e-06, + "loss": 0.8265239, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.512441396713257 + }, + { + "auxiliary_loss_clip": 0.0115238, + "auxiliary_loss_mlp": 0.01163276, + "balance_loss_clip": 1.00246036, + "balance_loss_mlp": 1.0011512, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 2.535708056287732, + "language_loss": 0.6647464, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68790293, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 2.956902503967285 + }, + { + "auxiliary_loss_clip": 0.01162513, + "auxiliary_loss_mlp": 0.01156859, + "balance_loss_clip": 1.00235641, + "balance_loss_mlp": 1.00045669, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8843667245508475, + "language_loss": 0.59503359, + "learning_rate": 3.850670485516019e-06, + "loss": 0.6182273, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.134834051132202 + }, + { + "auxiliary_loss_clip": 0.01181084, + "auxiliary_loss_mlp": 0.01164161, + "balance_loss_clip": 1.00240421, + "balance_loss_mlp": 1.001369, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.1226581425065065, + "language_loss": 0.65756947, + "learning_rate": 3.850522786049075e-06, + "loss": 0.68102193, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.4843223094940186 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01163819, + "balance_loss_clip": 1.00249743, + "balance_loss_mlp": 1.00140858, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.8736325660989908, + "language_loss": 0.75337845, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77651513, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.6472606658935547 + }, + { + "auxiliary_loss_clip": 0.01131239, + "auxiliary_loss_mlp": 0.01163331, + "balance_loss_clip": 1.00209379, + "balance_loss_mlp": 1.0007298, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.0570663757420697, + "language_loss": 0.72383672, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74678242, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.619990110397339 + }, + { + "auxiliary_loss_clip": 0.01148698, + "auxiliary_loss_mlp": 0.01163617, + "balance_loss_clip": 1.00227094, + "balance_loss_mlp": 1.00120664, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.9984611254547708, + "language_loss": 0.71922916, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74235237, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.6622889041900635 + }, + { + "auxiliary_loss_clip": 0.0113245, + "auxiliary_loss_mlp": 0.01163512, + "balance_loss_clip": 1.0021323, + "balance_loss_mlp": 1.00148296, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 1.8474457249139973, + "language_loss": 0.65059173, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67355138, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.722391128540039 + }, + { + "auxiliary_loss_clip": 0.01147585, + "auxiliary_loss_mlp": 0.01163174, + "balance_loss_clip": 1.00214148, + "balance_loss_mlp": 1.00114489, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.2202270016544636, + "language_loss": 0.83587497, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85898256, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.532316207885742 + }, + { + "auxiliary_loss_clip": 0.01131612, + "auxiliary_loss_mlp": 0.01163378, + "balance_loss_clip": 1.00206363, + "balance_loss_mlp": 1.00125313, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.8501228312794713, + "language_loss": 0.77637303, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79932296, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.5929462909698486 + }, + { + "auxiliary_loss_clip": 0.01180917, + "auxiliary_loss_mlp": 0.01162972, + "balance_loss_clip": 1.00239587, + "balance_loss_mlp": 1.00084782, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 1.8679070585692246, + "language_loss": 0.8544811, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87792003, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.5094223022460938 + }, + { + "auxiliary_loss_clip": 0.01163962, + "auxiliary_loss_mlp": 0.01163084, + "balance_loss_clip": 1.00228357, + "balance_loss_mlp": 1.00095892, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.5903792438396127, + "language_loss": 0.83098817, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85425866, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.545053482055664 + }, + { + "auxiliary_loss_clip": 0.01132967, + "auxiliary_loss_mlp": 0.01162826, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00089264, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 1.900350186329163, + "language_loss": 0.76622999, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78918797, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.5911195278167725 + }, + { + "auxiliary_loss_clip": 0.01181072, + "auxiliary_loss_mlp": 0.01163393, + "balance_loss_clip": 1.00235069, + "balance_loss_mlp": 1.00098181, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 2.614328955978997, + "language_loss": 0.76140058, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78484517, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.4654698371887207 + }, + { + "auxiliary_loss_clip": 0.01164084, + "auxiliary_loss_mlp": 0.01162899, + "balance_loss_clip": 1.0021174, + "balance_loss_mlp": 1.0012517, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 3.238920521900282, + "language_loss": 0.69249976, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71576953, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.527458906173706 + }, + { + "auxiliary_loss_clip": 0.01148957, + "auxiliary_loss_mlp": 0.01163524, + "balance_loss_clip": 1.00248146, + "balance_loss_mlp": 1.00111294, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 11.092108157079757, + "language_loss": 0.77889228, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.80201709, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.5968120098114014 + }, + { + "auxiliary_loss_clip": 0.01148718, + "auxiliary_loss_mlp": 0.00749034, + "balance_loss_clip": 1.0023315, + "balance_loss_mlp": 1.00070798, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 3.362578681314684, + "language_loss": 0.80639154, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82536906, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 3.958566188812256 + }, + { + "auxiliary_loss_clip": 0.01164389, + "auxiliary_loss_mlp": 0.01163437, + "balance_loss_clip": 1.00228596, + "balance_loss_mlp": 1.00112164, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.1756713262088723, + "language_loss": 0.73958814, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76286644, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 2.540656089782715 + }, + { + "auxiliary_loss_clip": 0.01099404, + "auxiliary_loss_mlp": 0.01162788, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00104523, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.4804099042276673, + "language_loss": 0.69365346, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71627539, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.7304928302764893 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.01163134, + "balance_loss_clip": 1.00230217, + "balance_loss_mlp": 1.00100923, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.035284482757786, + "language_loss": 0.73812199, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76140422, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.6137704849243164 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01156557, + "balance_loss_clip": 1.00177228, + "balance_loss_mlp": 1.00015414, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8695952774838791, + "language_loss": 0.647219, + "learning_rate": 3.84800116337411e-06, + "loss": 0.66976464, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 4.594067811965942 + }, + { + "auxiliary_loss_clip": 0.01164166, + "auxiliary_loss_mlp": 0.01162718, + "balance_loss_clip": 1.00234389, + "balance_loss_mlp": 1.00087953, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 3.9195366163778877, + "language_loss": 0.73328102, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75654984, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 2.5406012535095215 + }, + { + "auxiliary_loss_clip": 0.01148595, + "auxiliary_loss_mlp": 0.01162969, + "balance_loss_clip": 1.00218213, + "balance_loss_mlp": 1.00113022, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 2.174310044828806, + "language_loss": 0.77606988, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79918551, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 4.29481053352356 + }, + { + "auxiliary_loss_clip": 0.01163361, + "auxiliary_loss_mlp": 0.01155872, + "balance_loss_clip": 1.00230217, + "balance_loss_mlp": 1.00023198, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7255509111384641, + "language_loss": 0.54688835, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.57008076, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.308539390563965 + }, + { + "auxiliary_loss_clip": 0.0113343, + "auxiliary_loss_mlp": 0.01163041, + "balance_loss_clip": 1.00202239, + "balance_loss_mlp": 1.00101209, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 1.9490848980009716, + "language_loss": 0.78968728, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.81265199, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.6366939544677734 + }, + { + "auxiliary_loss_clip": 0.01165427, + "auxiliary_loss_mlp": 0.0116298, + "balance_loss_clip": 1.0023309, + "balance_loss_mlp": 1.00085485, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 3.0261445320263842, + "language_loss": 0.7021898, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72547394, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.5813376903533936 + }, + { + "auxiliary_loss_clip": 0.01164265, + "auxiliary_loss_mlp": 0.01162853, + "balance_loss_clip": 1.00216615, + "balance_loss_mlp": 1.00091946, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 5.2063466879947615, + "language_loss": 0.7832728, + "learning_rate": 3.847106342204354e-06, + "loss": 0.80654395, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.5248217582702637 + }, + { + "auxiliary_loss_clip": 0.01148451, + "auxiliary_loss_mlp": 0.0116312, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00118661, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 1.8466412645209382, + "language_loss": 0.7498228, + "learning_rate": 3.846956960161114e-06, + "loss": 0.77293849, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.619309902191162 + }, + { + "auxiliary_loss_clip": 0.01148598, + "auxiliary_loss_mlp": 0.01163148, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00111914, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.247436764682917, + "language_loss": 0.81646252, + "learning_rate": 3.84680750808108e-06, + "loss": 0.83958, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.589306592941284 + }, + { + "auxiliary_loss_clip": 0.01113054, + "auxiliary_loss_mlp": 0.0115567, + "balance_loss_clip": 1.00163436, + "balance_loss_mlp": 1.00003052, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8220602520423945, + "language_loss": 0.57879394, + "learning_rate": 3.846657985969922e-06, + "loss": 0.60148114, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.190603733062744 + }, + { + "auxiliary_loss_clip": 0.01164864, + "auxiliary_loss_mlp": 0.01162752, + "balance_loss_clip": 1.00229132, + "balance_loss_mlp": 1.00100851, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.7792006159068492, + "language_loss": 0.74761498, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77089107, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.6054506301879883 + }, + { + "auxiliary_loss_clip": 0.01148517, + "auxiliary_loss_mlp": 0.01163048, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00101924, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8141286800976921, + "language_loss": 0.74863458, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.77175021, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.582929849624634 + }, + { + "auxiliary_loss_clip": 0.01165207, + "auxiliary_loss_mlp": 0.01162632, + "balance_loss_clip": 1.00239336, + "balance_loss_mlp": 1.00088847, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.6053597110150113, + "language_loss": 0.79945135, + "learning_rate": 3.846208999506402e-06, + "loss": 0.82272971, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.533628225326538 + }, + { + "auxiliary_loss_clip": 0.01147989, + "auxiliary_loss_mlp": 0.01162581, + "balance_loss_clip": 1.00210738, + "balance_loss_mlp": 1.00093341, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8176052437517618, + "language_loss": 0.85113573, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87424141, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.561406135559082 + }, + { + "auxiliary_loss_clip": 0.0114834, + "auxiliary_loss_mlp": 0.01162897, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.00086784, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.1427165824135233, + "language_loss": 0.69496268, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71807504, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.7068824768066406 + }, + { + "auxiliary_loss_clip": 0.01149006, + "auxiliary_loss_mlp": 0.01163022, + "balance_loss_clip": 1.00243604, + "balance_loss_mlp": 1.00108838, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 1.8997893049442158, + "language_loss": 0.87177658, + "learning_rate": 3.845759382967026e-06, + "loss": 0.89489681, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.5958738327026367 + }, + { + "auxiliary_loss_clip": 0.01147565, + "auxiliary_loss_mlp": 0.01162462, + "balance_loss_clip": 1.002038, + "balance_loss_mlp": 1.00071836, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 2.132680333140742, + "language_loss": 0.83422381, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85732412, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.5741775035858154 + }, + { + "auxiliary_loss_clip": 0.01148692, + "auxiliary_loss_mlp": 0.01163029, + "balance_loss_clip": 1.00234091, + "balance_loss_mlp": 1.00128627, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 5.756416013469186, + "language_loss": 0.80753797, + "learning_rate": 3.845459288641066e-06, + "loss": 0.83065516, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.558741807937622 + }, + { + "auxiliary_loss_clip": 0.01165162, + "auxiliary_loss_mlp": 0.01162634, + "balance_loss_clip": 1.0022428, + "balance_loss_mlp": 1.00098586, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 1.9274840300616611, + "language_loss": 0.79028517, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81356311, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.574507474899292 + }, + { + "auxiliary_loss_clip": 0.01164739, + "auxiliary_loss_mlp": 0.01162787, + "balance_loss_clip": 1.00231624, + "balance_loss_mlp": 1.0010438, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.7891289953819305, + "language_loss": 0.8803221, + "learning_rate": 3.845158914395105e-06, + "loss": 0.90359741, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.5800979137420654 + }, + { + "auxiliary_loss_clip": 0.01120591, + "auxiliary_loss_mlp": 0.01162285, + "balance_loss_clip": 1.00227034, + "balance_loss_mlp": 1.00092375, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.4272379484014186, + "language_loss": 0.78628504, + "learning_rate": 3.84500862231636e-06, + "loss": 0.8091138, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.62138032913208 + }, + { + "auxiliary_loss_clip": 0.01180819, + "auxiliary_loss_mlp": 0.0116296, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00093079, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.394581793141625, + "language_loss": 0.77082014, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79425788, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.47177791595459 + }, + { + "auxiliary_loss_clip": 0.01153219, + "auxiliary_loss_mlp": 0.01162807, + "balance_loss_clip": 1.00246763, + "balance_loss_mlp": 1.00087333, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.0573356339535143, + "language_loss": 0.78254712, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80570734, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.5574653148651123 + }, + { + "auxiliary_loss_clip": 0.01148996, + "auxiliary_loss_mlp": 0.0116346, + "balance_loss_clip": 1.00241005, + "balance_loss_mlp": 1.00162172, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.3695711882569266, + "language_loss": 0.76340473, + "learning_rate": 3.844557326325461e-06, + "loss": 0.7865293, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.5921034812927246 + }, + { + "auxiliary_loss_clip": 0.01164201, + "auxiliary_loss_mlp": 0.01163129, + "balance_loss_clip": 1.00224304, + "balance_loss_mlp": 1.00129032, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.082339090913271, + "language_loss": 0.77769899, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.80097234, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.514028787612915 + }, + { + "auxiliary_loss_clip": 0.0111569, + "auxiliary_loss_mlp": 0.01162328, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00106156, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6197398975221513, + "language_loss": 0.89745975, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92023993, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.686864137649536 + }, + { + "auxiliary_loss_clip": 0.01164326, + "auxiliary_loss_mlp": 0.0116287, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00093579, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 1.838279367318592, + "language_loss": 0.93382978, + "learning_rate": 3.844105400822391e-06, + "loss": 0.9571017, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.592017650604248 + }, + { + "auxiliary_loss_clip": 0.01147864, + "auxiliary_loss_mlp": 0.01162362, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00100076, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.7966199811720474, + "language_loss": 0.75635552, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77945781, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.6485331058502197 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01162741, + "balance_loss_clip": 1.00207365, + "balance_loss_mlp": 1.00128436, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.5411466455660863, + "language_loss": 0.81502074, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83799845, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.6207430362701416 + }, + { + "auxiliary_loss_clip": 0.01180868, + "auxiliary_loss_mlp": 0.01162906, + "balance_loss_clip": 1.00240874, + "balance_loss_mlp": 1.00116313, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.1932928011069115, + "language_loss": 0.77595723, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79939497, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.542806625366211 + }, + { + "auxiliary_loss_clip": 0.01163975, + "auxiliary_loss_mlp": 0.01162334, + "balance_loss_clip": 1.00217223, + "balance_loss_mlp": 1.00087738, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.094519456470008, + "language_loss": 0.86623132, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88949436, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.5387744903564453 + }, + { + "auxiliary_loss_clip": 0.01164165, + "auxiliary_loss_mlp": 0.01163421, + "balance_loss_clip": 1.0022614, + "balance_loss_mlp": 1.00129688, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 1.8952030018065922, + "language_loss": 0.82728106, + "learning_rate": 3.843350793153673e-06, + "loss": 0.85055691, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.52372670173645 + }, + { + "auxiliary_loss_clip": 0.01180777, + "auxiliary_loss_mlp": 0.0116282, + "balance_loss_clip": 1.0023402, + "balance_loss_mlp": 1.00107634, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 3.4167403788043957, + "language_loss": 0.71184951, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73528552, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.537574291229248 + }, + { + "auxiliary_loss_clip": 0.01148995, + "auxiliary_loss_mlp": 0.01163196, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00107145, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.9431499607147884, + "language_loss": 0.77774107, + "learning_rate": 3.843048460745779e-06, + "loss": 0.80086291, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.792762279510498 + }, + { + "auxiliary_loss_clip": 0.01120249, + "auxiliary_loss_mlp": 0.01163065, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00103533, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0936463434224737, + "language_loss": 0.74632132, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76915449, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 4.2308127880096436 + }, + { + "auxiliary_loss_clip": 0.01147549, + "auxiliary_loss_mlp": 0.01162516, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00096357, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.744777129663881, + "language_loss": 0.80793762, + "learning_rate": 3.842745848783558e-06, + "loss": 0.83103824, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.6052026748657227 + }, + { + "auxiliary_loss_clip": 0.01164029, + "auxiliary_loss_mlp": 0.01162772, + "balance_loss_clip": 1.00215113, + "balance_loss_mlp": 1.00112379, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.6505705448514323, + "language_loss": 0.75178009, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77504814, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 2.507673978805542 + }, + { + "auxiliary_loss_clip": 0.01168909, + "auxiliary_loss_mlp": 0.01162881, + "balance_loss_clip": 1.00239253, + "balance_loss_mlp": 1.00094736, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.3468148873148453, + "language_loss": 0.77045262, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79377055, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 3.952064037322998 + }, + { + "auxiliary_loss_clip": 0.01163577, + "auxiliary_loss_mlp": 0.01155875, + "balance_loss_clip": 1.00257301, + "balance_loss_mlp": 1.00023544, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9289130035637089, + "language_loss": 0.56667989, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58987445, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.071218490600586 + }, + { + "auxiliary_loss_clip": 0.01118772, + "auxiliary_loss_mlp": 0.01163042, + "balance_loss_clip": 1.00215459, + "balance_loss_mlp": 1.00110829, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.498832553429686, + "language_loss": 0.88534915, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90816724, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.6243114471435547 + }, + { + "auxiliary_loss_clip": 0.01164159, + "auxiliary_loss_mlp": 0.01163235, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00130153, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 2.2378763895197866, + "language_loss": 0.78349084, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80676478, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 4.1108033657073975 + }, + { + "auxiliary_loss_clip": 0.01099595, + "auxiliary_loss_mlp": 0.0116295, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00111198, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.160973641328423, + "language_loss": 0.77692568, + "learning_rate": 3.841836336030151e-06, + "loss": 0.79955113, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.6421751976013184 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01162479, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.00111759, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.482542020313074, + "language_loss": 0.77007025, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79301465, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.66489839553833 + }, + { + "auxiliary_loss_clip": 0.01164517, + "auxiliary_loss_mlp": 0.00748849, + "balance_loss_clip": 1.00243938, + "balance_loss_mlp": 1.00051332, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 2.231281938308924, + "language_loss": 0.89863944, + "learning_rate": 3.84153260631005e-06, + "loss": 0.91777307, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.5698769092559814 + }, + { + "auxiliary_loss_clip": 0.0115215, + "auxiliary_loss_mlp": 0.01162727, + "balance_loss_clip": 1.0025115, + "balance_loss_mlp": 1.00107884, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.088509450839786, + "language_loss": 0.70466828, + "learning_rate": 3.841380636700468e-06, + "loss": 0.727817, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.6458845138549805 + }, + { + "auxiliary_loss_clip": 0.01147442, + "auxiliary_loss_mlp": 0.01162588, + "balance_loss_clip": 1.0021559, + "balance_loss_mlp": 1.00103521, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2903223305746927, + "language_loss": 0.92736769, + "learning_rate": 3.841228597265548e-06, + "loss": 0.95046794, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.575587034225464 + }, + { + "auxiliary_loss_clip": 0.01148196, + "auxiliary_loss_mlp": 0.01163634, + "balance_loss_clip": 1.0024606, + "balance_loss_mlp": 1.00131917, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.1783389258785393, + "language_loss": 0.6381709, + "learning_rate": 3.841076488011055e-06, + "loss": 0.66128922, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.6299781799316406 + }, + { + "auxiliary_loss_clip": 0.01147519, + "auxiliary_loss_mlp": 0.0116299, + "balance_loss_clip": 1.00221109, + "balance_loss_mlp": 1.00105596, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 2.382559431815546, + "language_loss": 0.87903786, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90214288, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.599789619445801 + }, + { + "auxiliary_loss_clip": 0.01163981, + "auxiliary_loss_mlp": 0.01162041, + "balance_loss_clip": 1.00233173, + "balance_loss_mlp": 1.00096583, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8494367000602139, + "language_loss": 0.835908, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85916829, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.514247179031372 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.00749014, + "balance_loss_clip": 1.00248492, + "balance_loss_mlp": 1.00075507, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.9343270138433217, + "language_loss": 0.75178373, + "learning_rate": 3.840619741387832e-06, + "loss": 0.77076232, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.5490152835845947 + }, + { + "auxiliary_loss_clip": 0.01114843, + "auxiliary_loss_mlp": 0.01162675, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00083661, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 1.7750137870394365, + "language_loss": 0.76014435, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78291953, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.729414463043213 + }, + { + "auxiliary_loss_clip": 0.01148131, + "auxiliary_loss_mlp": 0.01162663, + "balance_loss_clip": 1.00232768, + "balance_loss_mlp": 1.00120544, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 2.1380724441493433, + "language_loss": 0.71037638, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73348433, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.5919129848480225 + }, + { + "auxiliary_loss_clip": 0.01164936, + "auxiliary_loss_mlp": 0.01162437, + "balance_loss_clip": 1.00230789, + "balance_loss_mlp": 1.00107527, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 3.1160679940249154, + "language_loss": 0.7171942, + "learning_rate": 3.840162366596259e-06, + "loss": 0.74046791, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.576992988586426 + }, + { + "auxiliary_loss_clip": 0.01180483, + "auxiliary_loss_mlp": 0.01162198, + "balance_loss_clip": 1.00218785, + "balance_loss_mlp": 1.00102699, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.6246385081178016, + "language_loss": 0.84729093, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87071776, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01133229, + "auxiliary_loss_mlp": 0.01162427, + "balance_loss_clip": 1.0023427, + "balance_loss_mlp": 1.00106549, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.122617073710897, + "language_loss": 0.78323412, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80619067, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.6337039470672607 + }, + { + "auxiliary_loss_clip": 0.0114876, + "auxiliary_loss_mlp": 0.01162335, + "balance_loss_clip": 1.00241232, + "balance_loss_mlp": 1.00087833, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 2.633376612915832, + "language_loss": 0.70472288, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72783381, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.5988595485687256 + }, + { + "auxiliary_loss_clip": 0.01147262, + "auxiliary_loss_mlp": 0.01162498, + "balance_loss_clip": 1.00219953, + "balance_loss_mlp": 1.00104094, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7399712480735272, + "language_loss": 0.76583594, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78893352, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.5742390155792236 + }, + { + "auxiliary_loss_clip": 0.01164023, + "auxiliary_loss_mlp": 0.01162634, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00089061, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.3416106185855736, + "language_loss": 0.77528208, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7985487, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.5368974208831787 + }, + { + "auxiliary_loss_clip": 0.01151885, + "auxiliary_loss_mlp": 0.01162358, + "balance_loss_clip": 1.00248444, + "balance_loss_mlp": 1.00080526, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 2.0984393407267365, + "language_loss": 0.82332784, + "learning_rate": 3.839245733132652e-06, + "loss": 0.8464703, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.608253002166748 + }, + { + "auxiliary_loss_clip": 0.01180731, + "auxiliary_loss_mlp": 0.01162804, + "balance_loss_clip": 1.00238669, + "balance_loss_mlp": 1.00115657, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.5806370895510389, + "language_loss": 0.90460134, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92803675, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.5089480876922607 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.0116213, + "balance_loss_clip": 1.00214148, + "balance_loss_mlp": 1.00095868, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.8088334986643175, + "language_loss": 0.70214152, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72492659, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.626967668533325 + }, + { + "auxiliary_loss_clip": 0.0114835, + "auxiliary_loss_mlp": 0.01162487, + "balance_loss_clip": 1.00225043, + "balance_loss_mlp": 1.00102997, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.763359057406668, + "language_loss": 0.82792693, + "learning_rate": 3.838786474773448e-06, + "loss": 0.85103536, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.5812253952026367 + }, + { + "auxiliary_loss_clip": 0.01147192, + "auxiliary_loss_mlp": 0.01161944, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00086832, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 1.884644412542184, + "language_loss": 0.85167736, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87476873, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.595886468887329 + }, + { + "auxiliary_loss_clip": 0.01180491, + "auxiliary_loss_mlp": 0.01162125, + "balance_loss_clip": 1.00214994, + "balance_loss_mlp": 1.00095367, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.702995992062593, + "language_loss": 0.82123387, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84466004, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.549828290939331 + }, + { + "auxiliary_loss_clip": 0.01131429, + "auxiliary_loss_mlp": 0.011629, + "balance_loss_clip": 1.00234246, + "balance_loss_mlp": 1.00115716, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.8798957757981984, + "language_loss": 0.76491851, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78786182, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.6658334732055664 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01162171, + "balance_loss_clip": 1.00225449, + "balance_loss_mlp": 1.00100017, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 1.9625041907529133, + "language_loss": 0.82492536, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84787077, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.6553802490234375 + }, + { + "auxiliary_loss_clip": 0.01148384, + "auxiliary_loss_mlp": 0.01162484, + "balance_loss_clip": 1.0024035, + "balance_loss_mlp": 1.00112224, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.74355390702269, + "language_loss": 0.80843759, + "learning_rate": 3.838019649712958e-06, + "loss": 0.83154625, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.5665159225463867 + }, + { + "auxiliary_loss_clip": 0.01162565, + "auxiliary_loss_mlp": 0.01155844, + "balance_loss_clip": 1.00276971, + "balance_loss_mlp": 1.00020397, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.8503691878098445, + "language_loss": 0.58825719, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.61144125, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.258944272994995 + }, + { + "auxiliary_loss_clip": 0.01132326, + "auxiliary_loss_mlp": 0.01162256, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00098956, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 2.3475972099249853, + "language_loss": 0.84814614, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87109196, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.637786865234375 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.01162729, + "balance_loss_clip": 1.00228441, + "balance_loss_mlp": 1.00127161, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.287690583932809, + "language_loss": 0.79041189, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81367916, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.546602725982666 + }, + { + "auxiliary_loss_clip": 0.01164858, + "auxiliary_loss_mlp": 0.01162511, + "balance_loss_clip": 1.00231719, + "balance_loss_mlp": 1.00114942, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.5033995911911262, + "language_loss": 0.76309025, + "learning_rate": 3.837404935067705e-06, + "loss": 0.7863639, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.6338961124420166 + }, + { + "auxiliary_loss_clip": 0.01165057, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_clip": 1.00246692, + "balance_loss_mlp": 1.00076282, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 1.8270192813141868, + "language_loss": 0.762133, + "learning_rate": 3.837251082205368e-06, + "loss": 0.78540397, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.541301727294922 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.0116184, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00076485, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 1.8126202272164085, + "language_loss": 0.61815989, + "learning_rate": 3.837097159674286e-06, + "loss": 0.64108157, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 3.9839859008789062 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01162196, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00102496, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.6674142331449928, + "language_loss": 0.81107438, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83418345, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.5563712120056152 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01162725, + "balance_loss_clip": 1.00235295, + "balance_loss_mlp": 1.00107729, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.7643726800080257, + "language_loss": 0.88588858, + "learning_rate": 3.836789105629236e-06, + "loss": 0.90932286, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 2.5407941341400146 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01162261, + "balance_loss_clip": 1.00208271, + "balance_loss_mlp": 1.00108957, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 8.473954464986322, + "language_loss": 0.64857948, + "learning_rate": 3.83663497412695e-06, + "loss": 0.67136544, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 4.096487283706665 + }, + { + "auxiliary_loss_clip": 0.01132715, + "auxiliary_loss_mlp": 0.01162253, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00098705, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.9714157461473274, + "language_loss": 0.8309809, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85393053, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 2.6465871334075928 + }, + { + "auxiliary_loss_clip": 0.0113149, + "auxiliary_loss_mlp": 0.01161988, + "balance_loss_clip": 1.00219214, + "balance_loss_mlp": 1.00100756, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1260444454460767, + "language_loss": 0.79736125, + "learning_rate": 3.836326502192077e-06, + "loss": 0.82029605, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 2.6093435287475586 + }, + { + "auxiliary_loss_clip": 0.01163705, + "auxiliary_loss_mlp": 0.01162047, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00106716, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 5.5935668832512455, + "language_loss": 0.64850342, + "learning_rate": 3.836172161771189e-06, + "loss": 0.67176092, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 4.206108808517456 + }, + { + "auxiliary_loss_clip": 0.0114763, + "auxiliary_loss_mlp": 0.0116265, + "balance_loss_clip": 1.00222898, + "balance_loss_mlp": 1.00109792, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.207597120350559, + "language_loss": 0.82347262, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84657538, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.5793495178222656 + }, + { + "auxiliary_loss_clip": 0.01163691, + "auxiliary_loss_mlp": 0.01162063, + "balance_loss_clip": 1.00221395, + "balance_loss_mlp": 1.00108278, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 1.9931163145965118, + "language_loss": 0.72722876, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75048631, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.5229785442352295 + }, + { + "auxiliary_loss_clip": 0.01148819, + "auxiliary_loss_mlp": 0.01161593, + "balance_loss_clip": 1.00216019, + "balance_loss_mlp": 1.00080323, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.797224260633546, + "language_loss": 0.82165974, + "learning_rate": 3.835708722764952e-06, + "loss": 0.84476388, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.6009633541107178 + }, + { + "auxiliary_loss_clip": 0.01180488, + "auxiliary_loss_mlp": 0.01161762, + "balance_loss_clip": 1.00229645, + "balance_loss_mlp": 1.00097287, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 1.8455739796451514, + "language_loss": 0.86948562, + "learning_rate": 3.835554103867876e-06, + "loss": 0.8929081, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.4864039421081543 + }, + { + "auxiliary_loss_clip": 0.01164325, + "auxiliary_loss_mlp": 0.01161498, + "balance_loss_clip": 1.00230622, + "balance_loss_mlp": 1.00118566, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.7694483803048353, + "language_loss": 0.68358386, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70684206, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.548863649368286 + }, + { + "auxiliary_loss_clip": 0.01147555, + "auxiliary_loss_mlp": 0.0116156, + "balance_loss_clip": 1.00239849, + "balance_loss_mlp": 1.00096166, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 2.440961664119468, + "language_loss": 0.79843909, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82153028, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.616461992263794 + }, + { + "auxiliary_loss_clip": 0.01148049, + "auxiliary_loss_mlp": 0.00748727, + "balance_loss_clip": 1.00217354, + "balance_loss_mlp": 1.00045991, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 1.7725500811755837, + "language_loss": 0.82483447, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84380221, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 2.5724036693573 + }, + { + "auxiliary_loss_clip": 0.01180576, + "auxiliary_loss_mlp": 0.01162275, + "balance_loss_clip": 1.00242257, + "balance_loss_mlp": 1.00119925, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.2676614592673623, + "language_loss": 0.81630659, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83973503, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.462782382965088 + }, + { + "auxiliary_loss_clip": 0.01180408, + "auxiliary_loss_mlp": 0.00748824, + "balance_loss_clip": 1.00229764, + "balance_loss_mlp": 1.00052667, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 2.6176703085706055, + "language_loss": 0.88543159, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90472388, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 2.5073771476745605 + }, + { + "auxiliary_loss_clip": 0.01180598, + "auxiliary_loss_mlp": 0.01162401, + "balance_loss_clip": 1.0024159, + "balance_loss_mlp": 1.00132596, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.7826079824512373, + "language_loss": 0.78702116, + "learning_rate": 3.834624928998508e-06, + "loss": 0.81045115, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.4961090087890625 + }, + { + "auxiliary_loss_clip": 0.01131772, + "auxiliary_loss_mlp": 0.01162198, + "balance_loss_clip": 1.00221682, + "balance_loss_mlp": 1.00112259, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 2.5086161541591037, + "language_loss": 0.73833323, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76127297, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.624819755554199 + }, + { + "auxiliary_loss_clip": 0.01164549, + "auxiliary_loss_mlp": 0.01162098, + "balance_loss_clip": 1.00216568, + "balance_loss_mlp": 1.00102258, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.430653209963164, + "language_loss": 0.8741402, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.8974067, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.5061209201812744 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01162185, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00101388, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.8991835927346201, + "language_loss": 0.85282165, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87608027, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.5859861373901367 + }, + { + "auxiliary_loss_clip": 0.01164978, + "auxiliary_loss_mlp": 0.01162123, + "balance_loss_clip": 1.00231743, + "balance_loss_mlp": 1.00085711, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.410746550799754, + "language_loss": 0.73256499, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75583601, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.5923218727111816 + }, + { + "auxiliary_loss_clip": 0.01180519, + "auxiliary_loss_mlp": 0.01162005, + "balance_loss_clip": 1.00253201, + "balance_loss_mlp": 1.00112057, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.948426216874293, + "language_loss": 0.76730824, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.79073352, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.5778415203094482 + }, + { + "auxiliary_loss_clip": 0.01131224, + "auxiliary_loss_mlp": 0.0116213, + "balance_loss_clip": 1.002231, + "balance_loss_mlp": 1.0009588, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 2.8420077946002706, + "language_loss": 0.81804228, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84097582, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.625119209289551 + }, + { + "auxiliary_loss_clip": 0.01148313, + "auxiliary_loss_mlp": 0.01162051, + "balance_loss_clip": 1.00224411, + "balance_loss_mlp": 1.00116611, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.843936267586105, + "language_loss": 0.72545552, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74855918, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.582794666290283 + }, + { + "auxiliary_loss_clip": 0.01164887, + "auxiliary_loss_mlp": 0.0116145, + "balance_loss_clip": 1.00231171, + "balance_loss_mlp": 1.00094652, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 4.691794816426338, + "language_loss": 0.71978867, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74305207, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.561039924621582 + }, + { + "auxiliary_loss_clip": 0.01180517, + "auxiliary_loss_mlp": 0.01162267, + "balance_loss_clip": 1.00233448, + "balance_loss_mlp": 1.00100064, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.9482293860479922, + "language_loss": 0.72693467, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75036252, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.526732921600342 + }, + { + "auxiliary_loss_clip": 0.01164811, + "auxiliary_loss_mlp": 0.0116173, + "balance_loss_clip": 1.00235891, + "balance_loss_mlp": 1.00094032, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.0559054028557315, + "language_loss": 0.71061349, + "learning_rate": 3.833070739311887e-06, + "loss": 0.73387885, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.530640125274658 + }, + { + "auxiliary_loss_clip": 0.01136251, + "auxiliary_loss_mlp": 0.01162083, + "balance_loss_clip": 1.00248349, + "balance_loss_mlp": 1.00100768, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.8090083144620626, + "language_loss": 0.75877815, + "learning_rate": 3.83291493793963e-06, + "loss": 0.78176153, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.612833261489868 + }, + { + "auxiliary_loss_clip": 0.01131479, + "auxiliary_loss_mlp": 0.01162177, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00138783, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.8286954285149073, + "language_loss": 0.66456139, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68749797, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.6642932891845703 + }, + { + "auxiliary_loss_clip": 0.01163961, + "auxiliary_loss_mlp": 0.0116206, + "balance_loss_clip": 1.00231445, + "balance_loss_mlp": 1.0010798, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.3054304200425055, + "language_loss": 0.75531209, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77857232, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.55781626701355 + }, + { + "auxiliary_loss_clip": 0.01164806, + "auxiliary_loss_mlp": 0.01161828, + "balance_loss_clip": 1.00259089, + "balance_loss_mlp": 1.00142002, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.9259059744119122, + "language_loss": 0.73330408, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75657046, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.561616897583008 + }, + { + "auxiliary_loss_clip": 0.01147233, + "auxiliary_loss_mlp": 0.0116209, + "balance_loss_clip": 1.00241315, + "balance_loss_mlp": 1.00120473, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 1.7223458418365116, + "language_loss": 0.72555125, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74864453, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.7127532958984375 + }, + { + "auxiliary_loss_clip": 0.01164699, + "auxiliary_loss_mlp": 0.01161882, + "balance_loss_clip": 1.0024488, + "balance_loss_mlp": 1.00090146, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.249391975458941, + "language_loss": 0.73915106, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76241684, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.5611579418182373 + }, + { + "auxiliary_loss_clip": 0.01180773, + "auxiliary_loss_mlp": 0.01162367, + "balance_loss_clip": 1.00250649, + "balance_loss_mlp": 1.00091028, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.3309122944719887, + "language_loss": 0.79101163, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81444311, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.55450177192688 + }, + { + "auxiliary_loss_clip": 0.01148315, + "auxiliary_loss_mlp": 0.01161844, + "balance_loss_clip": 1.00245357, + "balance_loss_mlp": 1.00105453, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.9536327390798902, + "language_loss": 0.76896346, + "learning_rate": 3.831822382544101e-06, + "loss": 0.79206508, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.553819417953491 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.01161951, + "balance_loss_clip": 1.0026207, + "balance_loss_mlp": 1.00116158, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.8445274124110873, + "language_loss": 0.71049386, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73363501, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.651540994644165 + }, + { + "auxiliary_loss_clip": 0.01115711, + "auxiliary_loss_mlp": 0.01162326, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00105977, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 1.9617876109134815, + "language_loss": 0.72325605, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74603641, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 2.930022954940796 + }, + { + "auxiliary_loss_clip": 0.01116514, + "auxiliary_loss_mlp": 0.01161737, + "balance_loss_clip": 1.002321, + "balance_loss_mlp": 1.00113845, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7254595806689816, + "language_loss": 0.87592161, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89870405, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.664045572280884 + }, + { + "auxiliary_loss_clip": 0.01180597, + "auxiliary_loss_mlp": 0.01162107, + "balance_loss_clip": 1.00254965, + "balance_loss_mlp": 1.00103116, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7192400396305632, + "language_loss": 0.81521833, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83864534, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 3.952988862991333 + }, + { + "auxiliary_loss_clip": 0.01130876, + "auxiliary_loss_mlp": 0.01162034, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.0011493, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.558851368958038, + "language_loss": 0.80101752, + "learning_rate": 3.831039901828054e-06, + "loss": 0.8239466, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.6248745918273926 + }, + { + "auxiliary_loss_clip": 0.01180438, + "auxiliary_loss_mlp": 0.01162232, + "balance_loss_clip": 1.0024097, + "balance_loss_mlp": 1.00134683, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 5.091033353553146, + "language_loss": 0.80673575, + "learning_rate": 3.830883197361445e-06, + "loss": 0.83016247, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.5417346954345703 + }, + { + "auxiliary_loss_clip": 0.01116361, + "auxiliary_loss_mlp": 0.0116215, + "balance_loss_clip": 1.0024178, + "balance_loss_mlp": 1.00116968, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 2.2564932733639576, + "language_loss": 0.73814082, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76092589, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 4.199434995651245 + }, + { + "auxiliary_loss_clip": 0.01131618, + "auxiliary_loss_mlp": 0.0116199, + "balance_loss_clip": 1.0022788, + "balance_loss_mlp": 1.0012958, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.0238226395308514, + "language_loss": 0.85413563, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87707168, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.581101417541504 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01161533, + "balance_loss_clip": 1.0023005, + "balance_loss_mlp": 1.00093377, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.834363527233571, + "language_loss": 0.771016, + "learning_rate": 3.830412667421752e-06, + "loss": 0.79410142, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 4.060644865036011 + }, + { + "auxiliary_loss_clip": 0.01163789, + "auxiliary_loss_mlp": 0.0116248, + "balance_loss_clip": 1.00230598, + "balance_loss_mlp": 1.00111866, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 4.570083445474021, + "language_loss": 0.73503971, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.75830245, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 2.519829034805298 + }, + { + "auxiliary_loss_clip": 0.01164523, + "auxiliary_loss_mlp": 0.01162302, + "balance_loss_clip": 1.00236118, + "balance_loss_mlp": 1.00103593, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 2.0785238448868326, + "language_loss": 0.84327507, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.86654329, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 2.5433757305145264 + }, + { + "auxiliary_loss_clip": 0.0118032, + "auxiliary_loss_mlp": 0.01161136, + "balance_loss_clip": 1.00226164, + "balance_loss_mlp": 1.00082362, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.6588806584240132, + "language_loss": 0.78702503, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.81043959, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.5082168579101562 + }, + { + "auxiliary_loss_clip": 0.01163794, + "auxiliary_loss_mlp": 0.01162409, + "balance_loss_clip": 1.00233543, + "balance_loss_mlp": 1.00114274, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 1.9286976132717475, + "language_loss": 0.8302567, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85351878, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 2.5085952281951904 + }, + { + "auxiliary_loss_clip": 0.0118062, + "auxiliary_loss_mlp": 0.01162096, + "balance_loss_clip": 1.00249624, + "balance_loss_mlp": 1.00111556, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.690850620991481, + "language_loss": 0.77394992, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79737711, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.543459415435791 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.00749035, + "balance_loss_clip": 1.00200915, + "balance_loss_mlp": 1.00100529, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.017444893124673, + "language_loss": 0.89439559, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91319287, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.622514009475708 + }, + { + "auxiliary_loss_clip": 0.01098908, + "auxiliary_loss_mlp": 0.01162084, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00119901, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.302796538223696, + "language_loss": 0.75313038, + "learning_rate": 3.829312335177034e-06, + "loss": 0.77574027, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.6759233474731445 + }, + { + "auxiliary_loss_clip": 0.0113045, + "auxiliary_loss_mlp": 0.01161894, + "balance_loss_clip": 1.00199711, + "balance_loss_mlp": 1.00110507, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.0906965632025307, + "language_loss": 0.72432154, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74724495, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.747997760772705 + }, + { + "auxiliary_loss_clip": 0.01163784, + "auxiliary_loss_mlp": 0.0116179, + "balance_loss_clip": 1.00225067, + "balance_loss_mlp": 1.00109625, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.7889173172452615, + "language_loss": 0.78094774, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80420351, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.5943095684051514 + }, + { + "auxiliary_loss_clip": 0.01131091, + "auxiliary_loss_mlp": 0.011618, + "balance_loss_clip": 1.00222874, + "balance_loss_mlp": 1.00129652, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.911818966758211, + "language_loss": 0.75809085, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78101975, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 2.678724765777588 + }, + { + "auxiliary_loss_clip": 0.01097675, + "auxiliary_loss_mlp": 0.01162064, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00146496, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 3.145601848957188, + "language_loss": 0.81005621, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83265358, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 2.6742196083068848 + }, + { + "auxiliary_loss_clip": 0.01148509, + "auxiliary_loss_mlp": 0.01161568, + "balance_loss_clip": 1.00228286, + "balance_loss_mlp": 1.00115979, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4105121348638547, + "language_loss": 0.67169434, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69479513, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.6453535556793213 + }, + { + "auxiliary_loss_clip": 0.01148919, + "auxiliary_loss_mlp": 0.01162426, + "balance_loss_clip": 1.00231516, + "balance_loss_mlp": 1.00125492, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.217469199142428, + "language_loss": 0.75190055, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77501404, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.6130900382995605 + }, + { + "auxiliary_loss_clip": 0.01164454, + "auxiliary_loss_mlp": 0.01161574, + "balance_loss_clip": 1.00246155, + "balance_loss_mlp": 1.00116563, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 1.728688754705013, + "language_loss": 0.70597017, + "learning_rate": 3.828208603915186e-06, + "loss": 0.7292304, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.570852518081665 + }, + { + "auxiliary_loss_clip": 0.01180427, + "auxiliary_loss_mlp": 0.01161269, + "balance_loss_clip": 1.0025419, + "balance_loss_mlp": 1.00086105, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 1.987524009004612, + "language_loss": 0.79249716, + "learning_rate": 3.828050650669353e-06, + "loss": 0.81591415, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.5102412700653076 + }, + { + "auxiliary_loss_clip": 0.01163762, + "auxiliary_loss_mlp": 0.01161791, + "balance_loss_clip": 1.00227237, + "balance_loss_mlp": 1.00119281, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 2.1002376079913905, + "language_loss": 0.81546116, + "learning_rate": 3.827892628103657e-06, + "loss": 0.83871669, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.573092222213745 + }, + { + "auxiliary_loss_clip": 0.01180362, + "auxiliary_loss_mlp": 0.01161805, + "balance_loss_clip": 1.00228059, + "balance_loss_mlp": 1.00111103, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 2.4049255644028826, + "language_loss": 0.70104456, + "learning_rate": 3.827734536224087e-06, + "loss": 0.7244662, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.576080322265625 + }, + { + "auxiliary_loss_clip": 0.01147624, + "auxiliary_loss_mlp": 0.01161704, + "balance_loss_clip": 1.00236022, + "balance_loss_mlp": 1.0012002, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.2346604933973038, + "language_loss": 0.62252671, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64561999, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.560007095336914 + }, + { + "auxiliary_loss_clip": 0.01180382, + "auxiliary_loss_mlp": 0.01161236, + "balance_loss_clip": 1.002496, + "balance_loss_mlp": 1.00101912, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.6898754623328327, + "language_loss": 0.89642596, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91984212, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.47055983543396 + }, + { + "auxiliary_loss_clip": 0.01180406, + "auxiliary_loss_mlp": 0.01161659, + "balance_loss_clip": 1.0025363, + "balance_loss_mlp": 1.00106049, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8421680186358675, + "language_loss": 0.91595149, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93937218, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.4817922115325928 + }, + { + "auxiliary_loss_clip": 0.01082429, + "auxiliary_loss_mlp": 0.01162666, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00120866, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 2.5863195379124093, + "language_loss": 0.7207638, + "learning_rate": 3.827101475687033e-06, + "loss": 0.74321485, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.7156546115875244 + }, + { + "auxiliary_loss_clip": 0.011646, + "auxiliary_loss_mlp": 0.01161332, + "balance_loss_clip": 1.00241232, + "balance_loss_mlp": 1.00092435, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 1.9638537311099367, + "language_loss": 0.71159995, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73485923, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.5382611751556396 + }, + { + "auxiliary_loss_clip": 0.01131734, + "auxiliary_loss_mlp": 0.00749179, + "balance_loss_clip": 1.00243139, + "balance_loss_mlp": 1.00126803, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 4.181740120743541, + "language_loss": 0.80409932, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82290846, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.699207067489624 + }, + { + "auxiliary_loss_clip": 0.01147626, + "auxiliary_loss_mlp": 0.00749162, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00133955, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 3.185808568575753, + "language_loss": 0.69942164, + "learning_rate": 3.826625952782601e-06, + "loss": 0.71838951, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.611527442932129 + }, + { + "auxiliary_loss_clip": 0.01163644, + "auxiliary_loss_mlp": 0.01161205, + "balance_loss_clip": 1.00228953, + "balance_loss_mlp": 1.00079679, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.775313422920772, + "language_loss": 0.77148837, + "learning_rate": 3.826467306608095e-06, + "loss": 0.79473686, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.6048390865325928 + }, + { + "auxiliary_loss_clip": 0.01131326, + "auxiliary_loss_mlp": 0.01161286, + "balance_loss_clip": 1.00222564, + "balance_loss_mlp": 1.0009737, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.925198265029419, + "language_loss": 0.81715482, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84008092, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.7411186695098877 + }, + { + "auxiliary_loss_clip": 0.01131235, + "auxiliary_loss_mlp": 0.01161409, + "balance_loss_clip": 1.00204623, + "balance_loss_mlp": 1.00090599, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.169544028415983, + "language_loss": 0.73525536, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75818181, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.7002594470977783 + }, + { + "auxiliary_loss_clip": 0.01130222, + "auxiliary_loss_mlp": 0.01160997, + "balance_loss_clip": 1.00226831, + "balance_loss_mlp": 1.00106621, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.8802584462598315, + "language_loss": 0.77666169, + "learning_rate": 3.825990952549713e-06, + "loss": 0.7995739, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 2.9329628944396973 + }, + { + "auxiliary_loss_clip": 0.0116462, + "auxiliary_loss_mlp": 0.01161551, + "balance_loss_clip": 1.00241911, + "balance_loss_mlp": 1.00104737, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.748155683321082, + "language_loss": 0.74794841, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77121013, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.5231716632843018 + }, + { + "auxiliary_loss_clip": 0.01130979, + "auxiliary_loss_mlp": 0.01161294, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00098157, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.640294364593125, + "language_loss": 0.75221086, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77513361, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.7404682636260986 + }, + { + "auxiliary_loss_clip": 0.01131741, + "auxiliary_loss_mlp": 0.01162171, + "balance_loss_clip": 1.00222909, + "balance_loss_mlp": 1.00138175, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.358214490790971, + "language_loss": 0.90706033, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92999947, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.617201328277588 + }, + { + "auxiliary_loss_clip": 0.01114702, + "auxiliary_loss_mlp": 0.01161912, + "balance_loss_clip": 1.00220168, + "balance_loss_mlp": 1.00121844, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 1.8280995578970007, + "language_loss": 0.78086019, + "learning_rate": 3.82535484444872e-06, + "loss": 0.8036263, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 4.149913787841797 + }, + { + "auxiliary_loss_clip": 0.0114802, + "auxiliary_loss_mlp": 0.0074929, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00147295, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7237792237061331, + "language_loss": 0.74282122, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76179433, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.648967742919922 + }, + { + "auxiliary_loss_clip": 0.01148089, + "auxiliary_loss_mlp": 0.00749339, + "balance_loss_clip": 1.00238562, + "balance_loss_mlp": 1.00165582, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 1.7031042396984282, + "language_loss": 0.81964314, + "learning_rate": 3.825036375068263e-06, + "loss": 0.83861744, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.61128306388855 + }, + { + "auxiliary_loss_clip": 0.01115667, + "auxiliary_loss_mlp": 0.01161537, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00093865, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 3.2848600309910445, + "language_loss": 0.79885215, + "learning_rate": 3.824877036566672e-06, + "loss": 0.82162416, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 4.045126914978027 + }, + { + "auxiliary_loss_clip": 0.0116459, + "auxiliary_loss_mlp": 0.01161484, + "balance_loss_clip": 1.00230527, + "balance_loss_mlp": 1.00117159, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.6896256419181344, + "language_loss": 0.93958706, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96284777, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 2.5465493202209473 + }, + { + "auxiliary_loss_clip": 0.01131139, + "auxiliary_loss_mlp": 0.01161181, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.00086784, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 2.2310251307831375, + "language_loss": 0.84863198, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87155509, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.617913007736206 + }, + { + "auxiliary_loss_clip": 0.01148064, + "auxiliary_loss_mlp": 0.00749343, + "balance_loss_clip": 1.00227821, + "balance_loss_mlp": 1.00145483, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 2.05842410461434, + "language_loss": 0.8139925, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83296651, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 4.029237270355225 + }, + { + "auxiliary_loss_clip": 0.01180274, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_clip": 1.00239241, + "balance_loss_mlp": 1.00096416, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.9238898626667869, + "language_loss": 0.74099803, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76441455, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.495626211166382 + }, + { + "auxiliary_loss_clip": 0.01163486, + "auxiliary_loss_mlp": 0.01161445, + "balance_loss_clip": 1.00214911, + "balance_loss_mlp": 1.00113261, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6521010278338017, + "language_loss": 0.77534068, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79859006, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 2.5800364017486572 + }, + { + "auxiliary_loss_clip": 0.01163108, + "auxiliary_loss_mlp": 0.01155407, + "balance_loss_clip": 1.00352097, + "balance_loss_mlp": 1.00052989, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8123468768177736, + "language_loss": 0.55566841, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57885355, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 3.0392885208129883 + }, + { + "auxiliary_loss_clip": 0.01163584, + "auxiliary_loss_mlp": 0.01160811, + "balance_loss_clip": 1.00214624, + "balance_loss_mlp": 1.00078428, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 4.807617454721953, + "language_loss": 0.77355397, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79679787, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 2.5943713188171387 + }, + { + "auxiliary_loss_clip": 0.01164847, + "auxiliary_loss_mlp": 0.01161286, + "balance_loss_clip": 1.00244498, + "balance_loss_mlp": 1.00097299, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 2.185704126994924, + "language_loss": 0.64398992, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66725117, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.692237615585327 + }, + { + "auxiliary_loss_clip": 0.01164057, + "auxiliary_loss_mlp": 0.01161145, + "balance_loss_clip": 1.00236368, + "balance_loss_mlp": 1.00064194, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 3.164437520119384, + "language_loss": 0.85169047, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87494254, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.581935405731201 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.01160917, + "balance_loss_clip": 1.00212502, + "balance_loss_mlp": 1.0009861, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.427470881810624, + "language_loss": 0.72571278, + "learning_rate": 3.823279846575403e-06, + "loss": 0.74863768, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.6599514484405518 + }, + { + "auxiliary_loss_clip": 0.0116335, + "auxiliary_loss_mlp": 0.0116076, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00073361, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 1.5738397325189528, + "language_loss": 0.84361488, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86685598, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.534954309463501 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01161485, + "balance_loss_clip": 1.00251496, + "balance_loss_mlp": 1.00107741, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.9619053517755498, + "language_loss": 0.82813269, + "learning_rate": 3.822959578715685e-06, + "loss": 0.85107303, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.7682077884674072 + }, + { + "auxiliary_loss_clip": 0.01163524, + "auxiliary_loss_mlp": 0.0116083, + "balance_loss_clip": 1.00232697, + "balance_loss_mlp": 1.00089908, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.815033830870172, + "language_loss": 0.73436284, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75760639, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.588919162750244 + }, + { + "auxiliary_loss_clip": 0.0115181, + "auxiliary_loss_mlp": 0.01160773, + "balance_loss_clip": 1.00249076, + "balance_loss_mlp": 1.0009371, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 1.6316283409676404, + "language_loss": 0.76338744, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78651327, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 2.6997504234313965 + }, + { + "auxiliary_loss_clip": 0.0116358, + "auxiliary_loss_mlp": 0.01160848, + "balance_loss_clip": 1.00236487, + "balance_loss_mlp": 1.00082135, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.9028847255686676, + "language_loss": 0.70694429, + "learning_rate": 3.822478658490228e-06, + "loss": 0.73018861, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.636627197265625 + }, + { + "auxiliary_loss_clip": 0.01130263, + "auxiliary_loss_mlp": 0.0074842, + "balance_loss_clip": 1.00322628, + "balance_loss_mlp": 1.00062585, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7756071443729243, + "language_loss": 0.51801264, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53679949, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.2542388439178467 + }, + { + "auxiliary_loss_clip": 0.01147437, + "auxiliary_loss_mlp": 0.01160889, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.0008626, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.7214204758306746, + "language_loss": 0.80312538, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82620859, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.586808204650879 + }, + { + "auxiliary_loss_clip": 0.0114675, + "auxiliary_loss_mlp": 0.01161, + "balance_loss_clip": 1.00209308, + "balance_loss_mlp": 1.00125933, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 2.3894546004818644, + "language_loss": 0.68979496, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71287245, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.633755922317505 + }, + { + "auxiliary_loss_clip": 0.01148358, + "auxiliary_loss_mlp": 0.01161008, + "balance_loss_clip": 1.00237429, + "balance_loss_mlp": 1.00088573, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.7404517915197493, + "language_loss": 0.88121003, + "learning_rate": 3.821836464031348e-06, + "loss": 0.90430373, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.01180212, + "auxiliary_loss_mlp": 0.01161647, + "balance_loss_clip": 1.00241995, + "balance_loss_mlp": 1.00133455, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 2.045121428746417, + "language_loss": 0.74588239, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76930094, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.6491096019744873 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.00749326, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00151598, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.7410461029200501, + "language_loss": 0.7027905, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72176206, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.6990926265716553 + }, + { + "auxiliary_loss_clip": 0.01131544, + "auxiliary_loss_mlp": 0.0116089, + "balance_loss_clip": 1.00223112, + "balance_loss_mlp": 1.00115001, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8858548129879527, + "language_loss": 0.71935058, + "learning_rate": 3.821354092781567e-06, + "loss": 0.74227488, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 2.6772091388702393 + }, + { + "auxiliary_loss_clip": 0.01168349, + "auxiliary_loss_mlp": 0.01161207, + "balance_loss_clip": 1.00246453, + "balance_loss_mlp": 1.00108457, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.806982476686659, + "language_loss": 0.81667256, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83996814, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 2.550849199295044 + }, + { + "auxiliary_loss_clip": 0.01163208, + "auxiliary_loss_mlp": 0.0116085, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.0009191, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 2.340215888937148, + "language_loss": 0.71518606, + "learning_rate": 3.821032166608568e-06, + "loss": 0.73842669, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.542325735092163 + }, + { + "auxiliary_loss_clip": 0.01131343, + "auxiliary_loss_mlp": 0.01160884, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00095284, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 1.832575438167607, + "language_loss": 0.75961453, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78253686, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.7034969329833984 + }, + { + "auxiliary_loss_clip": 0.01180154, + "auxiliary_loss_mlp": 0.01160621, + "balance_loss_clip": 1.00241446, + "balance_loss_mlp": 1.00116658, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.840133950086882, + "language_loss": 0.87447357, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89788133, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.5366978645324707 + }, + { + "auxiliary_loss_clip": 0.0116363, + "auxiliary_loss_mlp": 0.01160869, + "balance_loss_clip": 1.00220871, + "balance_loss_mlp": 1.00131905, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.6566443433141715, + "language_loss": 0.88148272, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90472776, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.545334577560425 + }, + { + "auxiliary_loss_clip": 0.01163396, + "auxiliary_loss_mlp": 0.01161442, + "balance_loss_clip": 1.00211179, + "balance_loss_mlp": 1.00093865, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.3731846006710486, + "language_loss": 0.82415509, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84740353, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.556358814239502 + }, + { + "auxiliary_loss_clip": 0.01180221, + "auxiliary_loss_mlp": 0.01161238, + "balance_loss_clip": 1.00231481, + "balance_loss_mlp": 1.00102091, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.119995866278303, + "language_loss": 0.81602955, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83944416, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.5861666202545166 + }, + { + "auxiliary_loss_clip": 0.01179996, + "auxiliary_loss_mlp": 0.01160863, + "balance_loss_clip": 1.00234306, + "balance_loss_mlp": 1.00140893, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.491338371987028, + "language_loss": 0.83748877, + "learning_rate": 3.820064730995783e-06, + "loss": 0.86089742, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.536306858062744 + }, + { + "auxiliary_loss_clip": 0.0113166, + "auxiliary_loss_mlp": 0.01161331, + "balance_loss_clip": 1.00210524, + "balance_loss_mlp": 1.00120926, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 2.0782441101263256, + "language_loss": 0.6944043, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71733421, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.7005369663238525 + }, + { + "auxiliary_loss_clip": 0.01163694, + "auxiliary_loss_mlp": 0.01161634, + "balance_loss_clip": 1.00236392, + "balance_loss_mlp": 1.00122547, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.0686022363993146, + "language_loss": 0.82579327, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84904653, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.526716947555542 + }, + { + "auxiliary_loss_clip": 0.01180196, + "auxiliary_loss_mlp": 0.01161143, + "balance_loss_clip": 1.00230885, + "balance_loss_mlp": 1.00111663, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 2.281531452227698, + "language_loss": 0.88511211, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90852547, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.470353603363037 + }, + { + "auxiliary_loss_clip": 0.01180003, + "auxiliary_loss_mlp": 0.01160444, + "balance_loss_clip": 1.00231075, + "balance_loss_mlp": 1.00118077, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4512692221311803, + "language_loss": 0.80777955, + "learning_rate": 3.819418393498343e-06, + "loss": 0.83118403, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.5874080657958984 + }, + { + "auxiliary_loss_clip": 0.01163356, + "auxiliary_loss_mlp": 0.01160582, + "balance_loss_clip": 1.00232816, + "balance_loss_mlp": 1.00112784, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.6983925933721946, + "language_loss": 0.77512455, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79836392, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 4.000763416290283 + }, + { + "auxiliary_loss_clip": 0.01151535, + "auxiliary_loss_mlp": 0.01160547, + "balance_loss_clip": 1.00227618, + "balance_loss_mlp": 1.00099719, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 1.7444158378361125, + "language_loss": 0.86398387, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88710469, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.5688693523406982 + }, + { + "auxiliary_loss_clip": 0.01164312, + "auxiliary_loss_mlp": 0.00749155, + "balance_loss_clip": 1.00225556, + "balance_loss_mlp": 1.00144744, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 3.3885595086512867, + "language_loss": 0.80448651, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82362115, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.589975595474243 + }, + { + "auxiliary_loss_clip": 0.01151279, + "auxiliary_loss_mlp": 0.0116101, + "balance_loss_clip": 1.00239086, + "balance_loss_mlp": 1.00107849, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 2.24598735157868, + "language_loss": 0.73205042, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75517333, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.5639123916625977 + }, + { + "auxiliary_loss_clip": 0.0116347, + "auxiliary_loss_mlp": 0.01161046, + "balance_loss_clip": 1.00222766, + "balance_loss_mlp": 1.00111508, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.9269836630621584, + "language_loss": 0.72596073, + "learning_rate": 3.81860891934076e-06, + "loss": 0.74920595, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 3.98535418510437 + }, + { + "auxiliary_loss_clip": 0.01180068, + "auxiliary_loss_mlp": 0.01160949, + "balance_loss_clip": 1.00223005, + "balance_loss_mlp": 1.00101781, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.824345749884539, + "language_loss": 0.70444858, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72785878, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.5520107746124268 + }, + { + "auxiliary_loss_clip": 0.01128443, + "auxiliary_loss_mlp": 0.01154259, + "balance_loss_clip": 1.00236988, + "balance_loss_mlp": 1.00014579, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7737426935951874, + "language_loss": 0.53332484, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55615187, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 4.66361403465271 + }, + { + "auxiliary_loss_clip": 0.01147718, + "auxiliary_loss_mlp": 0.0074936, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00158453, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 3.040312075028418, + "language_loss": 0.76590085, + "learning_rate": 3.818122407255102e-06, + "loss": 0.78487158, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 2.566751718521118 + }, + { + "auxiliary_loss_clip": 0.01136995, + "auxiliary_loss_mlp": 0.01160576, + "balance_loss_clip": 1.00211322, + "balance_loss_mlp": 1.00112128, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9149291449371608, + "language_loss": 0.72551286, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74848855, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 2.6762845516204834 + }, + { + "auxiliary_loss_clip": 0.01151426, + "auxiliary_loss_mlp": 0.01160911, + "balance_loss_clip": 1.0021497, + "balance_loss_mlp": 1.0011704, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 2.681952351804017, + "language_loss": 0.83087206, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85399538, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.564138412475586 + }, + { + "auxiliary_loss_clip": 0.01098038, + "auxiliary_loss_mlp": 0.00749257, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00147891, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.582228938388544, + "language_loss": 0.86391956, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88239253, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 2.694380521774292 + }, + { + "auxiliary_loss_clip": 0.01147436, + "auxiliary_loss_mlp": 0.00749184, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.0014751, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.5172927004929206, + "language_loss": 0.91669267, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93565887, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.64193058013916 + }, + { + "auxiliary_loss_clip": 0.01131927, + "auxiliary_loss_mlp": 0.01161511, + "balance_loss_clip": 1.0023849, + "balance_loss_mlp": 1.00138938, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.1098862173118773, + "language_loss": 0.81574142, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83867574, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 2.612558126449585 + }, + { + "auxiliary_loss_clip": 0.0114845, + "auxiliary_loss_mlp": 0.01160993, + "balance_loss_clip": 1.00212789, + "balance_loss_mlp": 1.00077629, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.8345748860827467, + "language_loss": 0.80982494, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83291942, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.5652244091033936 + }, + { + "auxiliary_loss_clip": 0.01180214, + "auxiliary_loss_mlp": 0.01161287, + "balance_loss_clip": 1.00234699, + "balance_loss_mlp": 1.00116515, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.956920160539595, + "language_loss": 0.77017754, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79359257, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.5399837493896484 + }, + { + "auxiliary_loss_clip": 0.01163849, + "auxiliary_loss_mlp": 0.01160767, + "balance_loss_clip": 1.0025177, + "balance_loss_mlp": 1.00140762, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.0675729694992824, + "language_loss": 0.79492247, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8181687, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.5288407802581787 + }, + { + "auxiliary_loss_clip": 0.01164474, + "auxiliary_loss_mlp": 0.01161357, + "balance_loss_clip": 1.00237417, + "balance_loss_mlp": 1.00133038, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.9095828545931308, + "language_loss": 0.7822268, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80548507, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.5685484409332275 + }, + { + "auxiliary_loss_clip": 0.01148184, + "auxiliary_loss_mlp": 0.01160774, + "balance_loss_clip": 1.00215757, + "balance_loss_mlp": 1.0009383, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.1836455338994694, + "language_loss": 0.81538284, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83847243, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.6225178241729736 + }, + { + "auxiliary_loss_clip": 0.01146901, + "auxiliary_loss_mlp": 0.01160975, + "balance_loss_clip": 1.00231314, + "balance_loss_mlp": 1.00113928, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8097993827013048, + "language_loss": 0.86433977, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88741851, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.682238817214966 + }, + { + "auxiliary_loss_clip": 0.01146875, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_clip": 1.00228858, + "balance_loss_mlp": 1.00105512, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.6924401174036972, + "language_loss": 0.76688391, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78996062, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.6403305530548096 + }, + { + "auxiliary_loss_clip": 0.01146817, + "auxiliary_loss_mlp": 0.01161217, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00128555, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 3.7148565591424343, + "language_loss": 0.73729283, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76037318, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.605429172515869 + }, + { + "auxiliary_loss_clip": 0.01137184, + "auxiliary_loss_mlp": 0.01160923, + "balance_loss_clip": 1.00250149, + "balance_loss_mlp": 1.00099218, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 2.6643473208227815, + "language_loss": 0.72895706, + "learning_rate": 3.815843815948507e-06, + "loss": 0.7519381, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.6514811515808105 + }, + { + "auxiliary_loss_clip": 0.01132902, + "auxiliary_loss_mlp": 0.01161303, + "balance_loss_clip": 1.00242054, + "balance_loss_mlp": 1.00108612, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.638940930429353, + "language_loss": 0.74597925, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.76892138, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 2.5986084938049316 + }, + { + "auxiliary_loss_clip": 0.01116156, + "auxiliary_loss_mlp": 0.01160957, + "balance_loss_clip": 1.00212502, + "balance_loss_mlp": 1.00102568, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.8141389208141596, + "language_loss": 0.7927897, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81556082, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.6800084114074707 + }, + { + "auxiliary_loss_clip": 0.01164494, + "auxiliary_loss_mlp": 0.00749293, + "balance_loss_clip": 1.00233662, + "balance_loss_mlp": 1.0014745, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.1131453553757398, + "language_loss": 0.84949911, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86863697, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.571178674697876 + }, + { + "auxiliary_loss_clip": 0.01115303, + "auxiliary_loss_mlp": 0.01160476, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00102162, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.2834651292153834, + "language_loss": 0.71330965, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73606741, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.7047712802886963 + }, + { + "auxiliary_loss_clip": 0.01131111, + "auxiliary_loss_mlp": 0.01160989, + "balance_loss_clip": 1.00212514, + "balance_loss_mlp": 1.00124896, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.2923064939015005, + "language_loss": 0.71201521, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73493624, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.5974645614624023 + }, + { + "auxiliary_loss_clip": 0.01132335, + "auxiliary_loss_mlp": 0.01160478, + "balance_loss_clip": 1.00230193, + "balance_loss_mlp": 1.0011189, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9098601956215218, + "language_loss": 0.88678706, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90971518, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.5935983657836914 + }, + { + "auxiliary_loss_clip": 0.01163488, + "auxiliary_loss_mlp": 0.01160665, + "balance_loss_clip": 1.00235474, + "balance_loss_mlp": 1.00102043, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 2.331044709297537, + "language_loss": 0.73936141, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76260304, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.560342311859131 + }, + { + "auxiliary_loss_clip": 0.01163324, + "auxiliary_loss_mlp": 0.01160742, + "balance_loss_clip": 1.00230002, + "balance_loss_mlp": 1.00119221, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 2.1620434386501746, + "language_loss": 0.82600904, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84924966, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.5386734008789062 + }, + { + "auxiliary_loss_clip": 0.01163424, + "auxiliary_loss_mlp": 0.01160888, + "balance_loss_clip": 1.00226974, + "balance_loss_mlp": 1.00105262, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.294410714216534, + "language_loss": 0.84721148, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87045461, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.5295164585113525 + }, + { + "auxiliary_loss_clip": 0.01180039, + "auxiliary_loss_mlp": 0.0116065, + "balance_loss_clip": 1.00234365, + "balance_loss_mlp": 1.00119543, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.7898917289924263, + "language_loss": 0.7271198, + "learning_rate": 3.814207986905616e-06, + "loss": 0.75052667, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.471416711807251 + }, + { + "auxiliary_loss_clip": 0.01147173, + "auxiliary_loss_mlp": 0.0116085, + "balance_loss_clip": 1.00198162, + "balance_loss_mlp": 1.00101459, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 1.6319265724057825, + "language_loss": 0.74261183, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76569211, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.843549966812134 + }, + { + "auxiliary_loss_clip": 0.01133312, + "auxiliary_loss_mlp": 0.0116105, + "balance_loss_clip": 1.00237679, + "balance_loss_mlp": 1.00111866, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.0521332217182295, + "language_loss": 0.79078734, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.81373096, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.6264641284942627 + }, + { + "auxiliary_loss_clip": 0.01148996, + "auxiliary_loss_mlp": 0.01161124, + "balance_loss_clip": 1.00227952, + "balance_loss_mlp": 1.00109744, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 2.0822239459488685, + "language_loss": 0.69446152, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71756274, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.6336889266967773 + }, + { + "auxiliary_loss_clip": 0.01147538, + "auxiliary_loss_mlp": 0.01160747, + "balance_loss_clip": 1.00219083, + "balance_loss_mlp": 1.00110173, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 1.8325151949039973, + "language_loss": 0.81323183, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.83631468, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.6306934356689453 + }, + { + "auxiliary_loss_clip": 0.01147292, + "auxiliary_loss_mlp": 0.01160894, + "balance_loss_clip": 1.00220418, + "balance_loss_mlp": 1.00105846, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.9052802874511907, + "language_loss": 0.82495046, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84803236, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.6945440769195557 + }, + { + "auxiliary_loss_clip": 0.01082193, + "auxiliary_loss_mlp": 0.01160477, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00102282, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.515725027813313, + "language_loss": 0.78082055, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80324721, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 4.15261697769165 + }, + { + "auxiliary_loss_clip": 0.0115165, + "auxiliary_loss_mlp": 0.01160988, + "balance_loss_clip": 1.00243664, + "balance_loss_mlp": 1.00143862, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.598428453396386, + "language_loss": 0.81398326, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83710968, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 3.0031299591064453 + }, + { + "auxiliary_loss_clip": 0.01164128, + "auxiliary_loss_mlp": 0.01160339, + "balance_loss_clip": 1.00217938, + "balance_loss_mlp": 1.00088453, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 6.356108481674843, + "language_loss": 0.86950982, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89275455, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.6108601093292236 + }, + { + "auxiliary_loss_clip": 0.01147751, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_clip": 1.00227165, + "balance_loss_mlp": 1.00115073, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.7782242153455576, + "language_loss": 0.72466749, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74775296, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.6211626529693604 + }, + { + "auxiliary_loss_clip": 0.0116341, + "auxiliary_loss_mlp": 0.01160634, + "balance_loss_clip": 1.00215781, + "balance_loss_mlp": 1.00098872, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 12.455231272949659, + "language_loss": 0.81451774, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.83775818, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 4.057759761810303 + }, + { + "auxiliary_loss_clip": 0.01133373, + "auxiliary_loss_mlp": 0.01161515, + "balance_loss_clip": 1.00234282, + "balance_loss_mlp": 1.00120199, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 14.535072856908648, + "language_loss": 0.68978387, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71273273, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.791069984436035 + }, + { + "auxiliary_loss_clip": 0.01180091, + "auxiliary_loss_mlp": 0.01160592, + "balance_loss_clip": 1.00239897, + "balance_loss_mlp": 1.00094724, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.9191591764620033, + "language_loss": 0.79950929, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82291615, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 3.962475061416626 + }, + { + "auxiliary_loss_clip": 0.01146669, + "auxiliary_loss_mlp": 0.01160644, + "balance_loss_clip": 1.00226378, + "balance_loss_mlp": 1.00118995, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 1.7448965007349924, + "language_loss": 0.84765202, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.87072521, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 4.0057103633880615 + }, + { + "auxiliary_loss_clip": 0.01179902, + "auxiliary_loss_mlp": 0.0116052, + "balance_loss_clip": 1.00230408, + "balance_loss_mlp": 1.0009706, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5229214111482028, + "language_loss": 0.86151475, + "learning_rate": 3.811906270092265e-06, + "loss": 0.88491893, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 2.51704740524292 + }, + { + "auxiliary_loss_clip": 0.0114721, + "auxiliary_loss_mlp": 0.01159696, + "balance_loss_clip": 1.00228548, + "balance_loss_mlp": 1.00119567, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 2.7651138825428547, + "language_loss": 0.83291966, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85598868, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 2.6157100200653076 + }, + { + "auxiliary_loss_clip": 0.01130115, + "auxiliary_loss_mlp": 0.01160886, + "balance_loss_clip": 1.00221741, + "balance_loss_mlp": 1.00114536, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 1.8048095095416745, + "language_loss": 0.76565069, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78856075, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.6129469871520996 + }, + { + "auxiliary_loss_clip": 0.01180011, + "auxiliary_loss_mlp": 0.0116053, + "balance_loss_clip": 1.00233376, + "balance_loss_mlp": 1.00117111, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5679330891259176, + "language_loss": 0.80621421, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82961959, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.493729829788208 + }, + { + "auxiliary_loss_clip": 0.0116331, + "auxiliary_loss_mlp": 0.01160393, + "balance_loss_clip": 1.00228667, + "balance_loss_mlp": 1.00103414, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.595007691374687, + "language_loss": 0.69539559, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71863258, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.5268454551696777 + }, + { + "auxiliary_loss_clip": 0.01180061, + "auxiliary_loss_mlp": 0.00749076, + "balance_loss_clip": 1.00240374, + "balance_loss_mlp": 1.00124979, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.636342356119147, + "language_loss": 0.88346785, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90275919, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.5367462635040283 + }, + { + "auxiliary_loss_clip": 0.011629, + "auxiliary_loss_mlp": 0.01160445, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.00108635, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 1.9207391572322414, + "language_loss": 0.79411626, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81734973, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 2.5163931846618652 + }, + { + "auxiliary_loss_clip": 0.01163313, + "auxiliary_loss_mlp": 0.01160594, + "balance_loss_clip": 1.00239205, + "balance_loss_mlp": 1.00113964, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7476710972128047, + "language_loss": 0.94873065, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97196972, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.5652899742126465 + }, + { + "auxiliary_loss_clip": 0.01084167, + "auxiliary_loss_mlp": 0.0116095, + "balance_loss_clip": 1.00216675, + "balance_loss_mlp": 1.00120997, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.434177772667589, + "language_loss": 0.71267599, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73512715, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 3.080728530883789 + }, + { + "auxiliary_loss_clip": 0.01163523, + "auxiliary_loss_mlp": 0.01153972, + "balance_loss_clip": 1.0033716, + "balance_loss_mlp": 1.00062072, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7592695276210438, + "language_loss": 0.54056227, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56373715, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.8259334564208984 + }, + { + "auxiliary_loss_clip": 0.01179921, + "auxiliary_loss_mlp": 0.00749084, + "balance_loss_clip": 1.00237751, + "balance_loss_mlp": 1.00122356, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 2.464151337135672, + "language_loss": 0.75327772, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.535994052886963 + }, + { + "auxiliary_loss_clip": 0.01147386, + "auxiliary_loss_mlp": 0.01161113, + "balance_loss_clip": 1.00221598, + "balance_loss_mlp": 1.00127769, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.358187380667641, + "language_loss": 0.87012959, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89321458, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.5996618270874023 + }, + { + "auxiliary_loss_clip": 0.01131946, + "auxiliary_loss_mlp": 0.0116061, + "balance_loss_clip": 1.00216436, + "balance_loss_mlp": 1.00115561, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 2.0516153380978706, + "language_loss": 0.73218864, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7551142, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.740997552871704 + }, + { + "auxiliary_loss_clip": 0.01131192, + "auxiliary_loss_mlp": 0.01159872, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00108552, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.5891133460760762, + "language_loss": 0.75408477, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77699542, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.7100274562835693 + }, + { + "auxiliary_loss_clip": 0.01146779, + "auxiliary_loss_mlp": 0.01160346, + "balance_loss_clip": 1.00234699, + "balance_loss_mlp": 1.00089204, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.6557336749003666, + "language_loss": 0.85102344, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.87409467, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.6276049613952637 + }, + { + "auxiliary_loss_clip": 0.01179959, + "auxiliary_loss_mlp": 0.01160684, + "balance_loss_clip": 1.00238514, + "balance_loss_mlp": 1.00122952, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 1.7653175402376882, + "language_loss": 0.79246163, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81586814, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.5202033519744873 + }, + { + "auxiliary_loss_clip": 0.01098572, + "auxiliary_loss_mlp": 0.01160535, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00088942, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 1.994456126306864, + "language_loss": 0.74463761, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.76722872, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 2.660555601119995 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01160552, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.00100207, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 2.373318289551148, + "language_loss": 0.72980922, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75256342, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.660409688949585 + }, + { + "auxiliary_loss_clip": 0.01147852, + "auxiliary_loss_mlp": 0.01160472, + "balance_loss_clip": 1.00236332, + "balance_loss_mlp": 1.00101769, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.2030106644916416, + "language_loss": 0.88984776, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91293097, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.636575937271118 + }, + { + "auxiliary_loss_clip": 0.0111429, + "auxiliary_loss_mlp": 0.01160339, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00107527, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.881980110013618, + "language_loss": 0.88162202, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90436834, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.663066864013672 + }, + { + "auxiliary_loss_clip": 0.01178849, + "auxiliary_loss_mlp": 0.01153344, + "balance_loss_clip": 1.00323904, + "balance_loss_mlp": 1.0007565, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7835818350075211, + "language_loss": 0.59757662, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.62089849, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.108971118927002 + }, + { + "auxiliary_loss_clip": 0.01164323, + "auxiliary_loss_mlp": 0.01160723, + "balance_loss_clip": 1.00240362, + "balance_loss_mlp": 1.00136447, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8283084613797533, + "language_loss": 0.82365483, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84690529, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.5867435932159424 + }, + { + "auxiliary_loss_clip": 0.01180169, + "auxiliary_loss_mlp": 0.01160902, + "balance_loss_clip": 1.00238419, + "balance_loss_mlp": 1.00125694, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.5531718762471214, + "language_loss": 0.70121682, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72462749, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.4919562339782715 + }, + { + "auxiliary_loss_clip": 0.01163321, + "auxiliary_loss_mlp": 0.01160559, + "balance_loss_clip": 1.00248837, + "balance_loss_mlp": 1.00120008, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.1973576604609932, + "language_loss": 0.88874876, + "learning_rate": 3.808095651090769e-06, + "loss": 0.91198754, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.5172128677368164 + }, + { + "auxiliary_loss_clip": 0.01162073, + "auxiliary_loss_mlp": 0.01153199, + "balance_loss_clip": 1.00308704, + "balance_loss_mlp": 1.00061083, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6461685430805807, + "language_loss": 0.52883673, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.5519895, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.236917734146118 + }, + { + "auxiliary_loss_clip": 0.01146846, + "auxiliary_loss_mlp": 0.0116041, + "balance_loss_clip": 1.00209415, + "balance_loss_mlp": 1.00086045, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.713343840758128, + "language_loss": 0.85372669, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87679929, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.564002752304077 + }, + { + "auxiliary_loss_clip": 0.01146283, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_clip": 1.00293016, + "balance_loss_mlp": 1.00055647, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.8112712446764311, + "language_loss": 0.57483953, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59784144, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 3.026404857635498 + }, + { + "auxiliary_loss_clip": 0.011301, + "auxiliary_loss_mlp": 0.01152864, + "balance_loss_clip": 1.00253665, + "balance_loss_mlp": 1.00027597, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8601510497120048, + "language_loss": 0.56254131, + "learning_rate": 3.807429230178015e-06, + "loss": 0.5853709, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.031240940093994 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.01160815, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00126505, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.0753794602716797, + "language_loss": 0.70472324, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72747314, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.671187400817871 + }, + { + "auxiliary_loss_clip": 0.01163837, + "auxiliary_loss_mlp": 0.01160195, + "balance_loss_clip": 1.00231135, + "balance_loss_mlp": 1.00093162, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 3.933664187914614, + "language_loss": 0.85900366, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88224399, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.6291723251342773 + }, + { + "auxiliary_loss_clip": 0.0111561, + "auxiliary_loss_mlp": 0.01159757, + "balance_loss_clip": 1.00223231, + "balance_loss_mlp": 1.00097072, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.295224786526468, + "language_loss": 0.82540882, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84816253, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 4.32736611366272 + }, + { + "auxiliary_loss_clip": 0.01131503, + "auxiliary_loss_mlp": 0.01160227, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00096321, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.3870936178120856, + "language_loss": 0.83130157, + "learning_rate": 3.806761712658952e-06, + "loss": 0.8542189, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.6710879802703857 + }, + { + "auxiliary_loss_clip": 0.01163245, + "auxiliary_loss_mlp": 0.01160042, + "balance_loss_clip": 1.00244713, + "balance_loss_mlp": 1.00106454, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 2.1203119498488636, + "language_loss": 0.80790842, + "learning_rate": 3.806594661981897e-06, + "loss": 0.83114129, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.518198251724243 + }, + { + "auxiliary_loss_clip": 0.01163629, + "auxiliary_loss_mlp": 0.01160226, + "balance_loss_clip": 1.00254631, + "balance_loss_mlp": 1.00115335, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.970904585888709, + "language_loss": 0.80471635, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82795489, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.5258920192718506 + }, + { + "auxiliary_loss_clip": 0.01164164, + "auxiliary_loss_mlp": 0.01160397, + "balance_loss_clip": 1.00225019, + "balance_loss_mlp": 1.00103819, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.839909022804504, + "language_loss": 0.85732704, + "learning_rate": 3.806260355115371e-06, + "loss": 0.88057268, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 3.9586269855499268 + }, + { + "auxiliary_loss_clip": 0.01146747, + "auxiliary_loss_mlp": 0.0116014, + "balance_loss_clip": 1.00221395, + "balance_loss_mlp": 1.00087631, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 1.9282227226958455, + "language_loss": 0.74415952, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76722836, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 4.008536100387573 + }, + { + "auxiliary_loss_clip": 0.01118892, + "auxiliary_loss_mlp": 0.00749037, + "balance_loss_clip": 1.00269878, + "balance_loss_mlp": 1.00118136, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.377270459133866, + "language_loss": 0.65881711, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67749643, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 4.115427017211914 + }, + { + "auxiliary_loss_clip": 0.01147746, + "auxiliary_loss_mlp": 0.01159853, + "balance_loss_clip": 1.00243521, + "balance_loss_mlp": 1.0009706, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.2523279081082155, + "language_loss": 0.785128, + "learning_rate": 3.805758381129643e-06, + "loss": 0.808204, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 2.573012113571167 + }, + { + "auxiliary_loss_clip": 0.01115502, + "auxiliary_loss_mlp": 0.01160054, + "balance_loss_clip": 1.00220013, + "balance_loss_mlp": 1.00098157, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.7681501313669539, + "language_loss": 0.7522338, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77498937, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 2.646644115447998 + }, + { + "auxiliary_loss_clip": 0.01135984, + "auxiliary_loss_mlp": 0.01160527, + "balance_loss_clip": 1.00257313, + "balance_loss_mlp": 1.00116777, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 2.4983666795070225, + "language_loss": 0.67816359, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70112872, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.672922134399414 + }, + { + "auxiliary_loss_clip": 0.01180061, + "auxiliary_loss_mlp": 0.01160177, + "balance_loss_clip": 1.00251496, + "balance_loss_mlp": 1.00129473, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.7304787173493883, + "language_loss": 0.70497918, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72838151, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.5168025493621826 + }, + { + "auxiliary_loss_clip": 0.01147481, + "auxiliary_loss_mlp": 0.01160516, + "balance_loss_clip": 1.00217962, + "balance_loss_mlp": 1.00106192, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.7041857599815307, + "language_loss": 0.60937035, + "learning_rate": 3.805088123868126e-06, + "loss": 0.63245022, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.6340503692626953 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01152726, + "balance_loss_clip": 1.00270915, + "balance_loss_mlp": 1.00013816, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.7833088270815733, + "language_loss": 0.58768415, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.61066806, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.228363275527954 + }, + { + "auxiliary_loss_clip": 0.01164235, + "auxiliary_loss_mlp": 0.01160408, + "balance_loss_clip": 1.00241721, + "balance_loss_mlp": 1.00114393, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 2.235039059344472, + "language_loss": 0.76473784, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78798419, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.5857207775115967 + }, + { + "auxiliary_loss_clip": 0.01162794, + "auxiliary_loss_mlp": 0.01160251, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.00108242, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 1.9485296761405446, + "language_loss": 0.77407795, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79730839, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.549267292022705 + }, + { + "auxiliary_loss_clip": 0.01146363, + "auxiliary_loss_mlp": 0.01152785, + "balance_loss_clip": 1.0027442, + "balance_loss_mlp": 1.00019729, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8551832125249941, + "language_loss": 0.59367371, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61666524, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 3.0158989429473877 + }, + { + "auxiliary_loss_clip": 0.0116325, + "auxiliary_loss_mlp": 0.01160487, + "balance_loss_clip": 1.0023123, + "balance_loss_mlp": 1.00131893, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.7293466807703508, + "language_loss": 0.70315653, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72639388, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 2.680445909500122 + }, + { + "auxiliary_loss_clip": 0.01131273, + "auxiliary_loss_mlp": 0.011601, + "balance_loss_clip": 1.00225091, + "balance_loss_mlp": 1.0010277, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 2.1077740832469933, + "language_loss": 0.79470348, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81761718, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.6252663135528564 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.011603, + "balance_loss_clip": 1.00232077, + "balance_loss_mlp": 1.00113165, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 2.1884001846725716, + "language_loss": 0.7134918, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73657441, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.6954433917999268 + }, + { + "auxiliary_loss_clip": 0.01130686, + "auxiliary_loss_mlp": 0.01160034, + "balance_loss_clip": 1.0020448, + "balance_loss_mlp": 1.00105619, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 1.8509623923105374, + "language_loss": 0.71691871, + "learning_rate": 3.803744324194691e-06, + "loss": 0.7398259, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.6333043575286865 + }, + { + "auxiliary_loss_clip": 0.01163511, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_clip": 1.00233388, + "balance_loss_mlp": 1.00107944, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.8253894607502605, + "language_loss": 0.77176678, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79500341, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.6231322288513184 + }, + { + "auxiliary_loss_clip": 0.01151468, + "auxiliary_loss_mlp": 0.01159682, + "balance_loss_clip": 1.00247073, + "balance_loss_mlp": 1.00079978, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2563134124773114, + "language_loss": 0.71782589, + "learning_rate": 3.803407690167187e-06, + "loss": 0.74093747, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.6630096435546875 + }, + { + "auxiliary_loss_clip": 0.01146664, + "auxiliary_loss_mlp": 0.01159667, + "balance_loss_clip": 1.00211072, + "balance_loss_mlp": 1.00116634, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 2.2014355392522593, + "language_loss": 0.84207463, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86513793, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.6061792373657227 + }, + { + "auxiliary_loss_clip": 0.01098201, + "auxiliary_loss_mlp": 0.01159993, + "balance_loss_clip": 1.00196826, + "balance_loss_mlp": 1.00092006, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.7809849830138167, + "language_loss": 0.81317139, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83575332, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.704463481903076 + }, + { + "auxiliary_loss_clip": 0.01163729, + "auxiliary_loss_mlp": 0.01159481, + "balance_loss_clip": 1.00240231, + "balance_loss_mlp": 1.00107563, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.3764765783215296, + "language_loss": 0.75171924, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77495134, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.563075304031372 + }, + { + "auxiliary_loss_clip": 0.01180058, + "auxiliary_loss_mlp": 0.01160398, + "balance_loss_clip": 1.0026257, + "balance_loss_mlp": 1.00123036, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.7572154794442738, + "language_loss": 0.7963028, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81970733, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.5025784969329834 + }, + { + "auxiliary_loss_clip": 0.01082801, + "auxiliary_loss_mlp": 0.01160489, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00093985, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.1358417740558284, + "language_loss": 0.70970893, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.73214185, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 2.805034875869751 + }, + { + "auxiliary_loss_clip": 0.01132246, + "auxiliary_loss_mlp": 0.00749098, + "balance_loss_clip": 1.00238121, + "balance_loss_mlp": 1.00126839, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 2.146862662808509, + "language_loss": 0.84156656, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.86037993, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.5934479236602783 + }, + { + "auxiliary_loss_clip": 0.01147653, + "auxiliary_loss_mlp": 0.01160544, + "balance_loss_clip": 1.00236511, + "balance_loss_mlp": 1.0011847, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 4.764506625784826, + "language_loss": 0.82943285, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.85251486, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.557776689529419 + }, + { + "auxiliary_loss_clip": 0.01163173, + "auxiliary_loss_mlp": 0.01160386, + "balance_loss_clip": 1.00243282, + "balance_loss_mlp": 1.00102687, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4844328831157023, + "language_loss": 0.8104279, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83366346, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.624439239501953 + }, + { + "auxiliary_loss_clip": 0.01164306, + "auxiliary_loss_mlp": 0.01159979, + "balance_loss_clip": 1.00259113, + "balance_loss_mlp": 1.00100112, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.435665897949785, + "language_loss": 0.76401758, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78726041, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.652672290802002 + }, + { + "auxiliary_loss_clip": 0.01130382, + "auxiliary_loss_mlp": 0.01152612, + "balance_loss_clip": 1.00265098, + "balance_loss_mlp": 1.00002372, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8262868935747243, + "language_loss": 0.55483943, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57766938, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.1775259971618652 + }, + { + "auxiliary_loss_clip": 0.01164042, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_clip": 1.00239134, + "balance_loss_mlp": 1.00098681, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 2.4341481842541697, + "language_loss": 0.72332156, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.74656159, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.5532474517822266 + }, + { + "auxiliary_loss_clip": 0.0112965, + "auxiliary_loss_mlp": 0.01159953, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00116587, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 2.69601415159657, + "language_loss": 0.69888121, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7217772, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.6386072635650635 + }, + { + "auxiliary_loss_clip": 0.0114851, + "auxiliary_loss_mlp": 0.01159837, + "balance_loss_clip": 1.00218737, + "balance_loss_mlp": 1.00114536, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.2048606800876533, + "language_loss": 0.70058239, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72366589, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.588594436645508 + }, + { + "auxiliary_loss_clip": 0.01129957, + "auxiliary_loss_mlp": 0.01159757, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00097013, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.3427247412340404, + "language_loss": 0.80515701, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82805419, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.5948233604431152 + }, + { + "auxiliary_loss_clip": 0.01163762, + "auxiliary_loss_mlp": 0.01160086, + "balance_loss_clip": 1.00235963, + "balance_loss_mlp": 1.00091743, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.1021152340542493, + "language_loss": 0.8812567, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90449512, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.5018632411956787 + }, + { + "auxiliary_loss_clip": 0.01163265, + "auxiliary_loss_mlp": 0.01159939, + "balance_loss_clip": 1.00234282, + "balance_loss_mlp": 1.00105667, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.864389159888265, + "language_loss": 0.92644405, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94967604, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.539860963821411 + }, + { + "auxiliary_loss_clip": 0.01164559, + "auxiliary_loss_mlp": 0.01160328, + "balance_loss_clip": 1.00265694, + "balance_loss_mlp": 1.00096893, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 2.649668034790838, + "language_loss": 0.78657997, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80982888, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 3.9804673194885254 + }, + { + "auxiliary_loss_clip": 0.0116313, + "auxiliary_loss_mlp": 0.0116013, + "balance_loss_clip": 1.00237441, + "balance_loss_mlp": 1.00096178, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.238479153351828, + "language_loss": 0.74994326, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7731759, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.5205183029174805 + }, + { + "auxiliary_loss_clip": 0.01147219, + "auxiliary_loss_mlp": 0.01159938, + "balance_loss_clip": 1.00230539, + "balance_loss_mlp": 1.00115156, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.1093603821316473, + "language_loss": 0.68899137, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71206295, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.522183895111084 + }, + { + "auxiliary_loss_clip": 0.01179754, + "auxiliary_loss_mlp": 0.01160305, + "balance_loss_clip": 1.00250173, + "balance_loss_mlp": 1.00123239, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 5.905072932849278, + "language_loss": 0.61808318, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64148378, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.4973177909851074 + }, + { + "auxiliary_loss_clip": 0.0114724, + "auxiliary_loss_mlp": 0.01159113, + "balance_loss_clip": 1.00221241, + "balance_loss_mlp": 1.00089872, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.6763946352910113, + "language_loss": 0.82193774, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84500128, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 3.975738763809204 + }, + { + "auxiliary_loss_clip": 0.0114671, + "auxiliary_loss_mlp": 0.01160099, + "balance_loss_clip": 1.00230122, + "balance_loss_mlp": 1.00112104, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.0341501094795067, + "language_loss": 0.87249458, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89556265, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 4.106682777404785 + }, + { + "auxiliary_loss_clip": 0.0114761, + "auxiliary_loss_mlp": 0.01159878, + "balance_loss_clip": 1.00227523, + "balance_loss_mlp": 1.0012815, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.8835705102365756, + "language_loss": 0.81660306, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83967793, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 3.961658477783203 + }, + { + "auxiliary_loss_clip": 0.01179766, + "auxiliary_loss_mlp": 0.01159865, + "balance_loss_clip": 1.00247598, + "balance_loss_mlp": 1.00107861, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 2.3363891028641364, + "language_loss": 0.81253266, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83592904, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.4475178718566895 + }, + { + "auxiliary_loss_clip": 0.01146773, + "auxiliary_loss_mlp": 0.01151829, + "balance_loss_clip": 1.00285101, + "balance_loss_mlp": 1.0000037, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9373443704859085, + "language_loss": 0.610717, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63370305, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 3.0632593631744385 + }, + { + "auxiliary_loss_clip": 0.0113152, + "auxiliary_loss_mlp": 0.01159588, + "balance_loss_clip": 1.00219154, + "balance_loss_mlp": 1.00089669, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.8639862961421692, + "language_loss": 0.7865907, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80950177, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.6808102130889893 + }, + { + "auxiliary_loss_clip": 0.0116444, + "auxiliary_loss_mlp": 0.01159883, + "balance_loss_clip": 1.0026058, + "balance_loss_mlp": 1.00119162, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 1.9410870234009203, + "language_loss": 0.78763485, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.81087804, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.559905767440796 + }, + { + "auxiliary_loss_clip": 0.01162861, + "auxiliary_loss_mlp": 0.00749017, + "balance_loss_clip": 1.00227273, + "balance_loss_mlp": 1.00112808, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.882994587342223, + "language_loss": 0.74881756, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.76793635, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.5834765434265137 + }, + { + "auxiliary_loss_clip": 0.01146542, + "auxiliary_loss_mlp": 0.01160267, + "balance_loss_clip": 1.00242293, + "balance_loss_mlp": 1.0011946, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9256286795126993, + "language_loss": 0.60025716, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62332535, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.743967294692993 + }, + { + "auxiliary_loss_clip": 0.01151515, + "auxiliary_loss_mlp": 0.01159606, + "balance_loss_clip": 1.00290489, + "balance_loss_mlp": 1.00110507, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.681539139588404, + "language_loss": 0.72966021, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75277144, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.6555986404418945 + }, + { + "auxiliary_loss_clip": 0.01179812, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_clip": 1.00245953, + "balance_loss_mlp": 1.0010612, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.8890513699505689, + "language_loss": 0.86066341, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.88406193, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 2.5007786750793457 + }, + { + "auxiliary_loss_clip": 0.01164379, + "auxiliary_loss_mlp": 0.01159951, + "balance_loss_clip": 1.00256073, + "balance_loss_mlp": 1.00106931, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.6223832514787768, + "language_loss": 0.82317001, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84641337, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.5596203804016113 + }, + { + "auxiliary_loss_clip": 0.01147762, + "auxiliary_loss_mlp": 0.01159894, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00101185, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.704881653928379, + "language_loss": 0.74305749, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76613402, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.581031322479248 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01151915, + "balance_loss_clip": 1.00297487, + "balance_loss_mlp": 1.00009012, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7529895451139968, + "language_loss": 0.5641678, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58699495, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 3.2888572216033936 + }, + { + "auxiliary_loss_clip": 0.01136675, + "auxiliary_loss_mlp": 0.01159782, + "balance_loss_clip": 1.00234926, + "balance_loss_mlp": 1.0009954, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7596496812146256, + "language_loss": 0.83657008, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85953462, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 2.670293092727661 + }, + { + "auxiliary_loss_clip": 0.01130761, + "auxiliary_loss_mlp": 0.01159864, + "balance_loss_clip": 1.00219369, + "balance_loss_mlp": 1.00088608, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 1.9618266294176527, + "language_loss": 0.78519827, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80810452, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.6737008094787598 + }, + { + "auxiliary_loss_clip": 0.01129878, + "auxiliary_loss_mlp": 0.01160084, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00120199, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.788849069712885, + "language_loss": 0.79423285, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81713247, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.6141281127929688 + }, + { + "auxiliary_loss_clip": 0.01147379, + "auxiliary_loss_mlp": 0.01160088, + "balance_loss_clip": 1.00230169, + "balance_loss_mlp": 1.00139701, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.6064398763266419, + "language_loss": 0.88912439, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91219914, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.600959539413452 + }, + { + "auxiliary_loss_clip": 0.0117961, + "auxiliary_loss_mlp": 0.01159646, + "balance_loss_clip": 1.00236869, + "balance_loss_mlp": 1.00114512, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.16456189657589, + "language_loss": 0.72609669, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74948925, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.6487772464752197 + }, + { + "auxiliary_loss_clip": 0.01130763, + "auxiliary_loss_mlp": 0.01160005, + "balance_loss_clip": 1.00235486, + "balance_loss_mlp": 1.00131392, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.889747315094349, + "language_loss": 0.86637163, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88927931, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.6385912895202637 + }, + { + "auxiliary_loss_clip": 0.01163279, + "auxiliary_loss_mlp": 0.01159984, + "balance_loss_clip": 1.0023427, + "balance_loss_mlp": 1.00091124, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 2.547118041024474, + "language_loss": 0.74067032, + "learning_rate": 3.796446484348989e-06, + "loss": 0.7639029, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.5035057067871094 + }, + { + "auxiliary_loss_clip": 0.01115458, + "auxiliary_loss_mlp": 0.01159952, + "balance_loss_clip": 1.002321, + "balance_loss_mlp": 1.0010699, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.162586191607239, + "language_loss": 0.80062699, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82338113, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.6339874267578125 + }, + { + "auxiliary_loss_clip": 0.01163564, + "auxiliary_loss_mlp": 0.01159166, + "balance_loss_clip": 1.00245869, + "balance_loss_mlp": 1.00104713, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 1.7497611324290756, + "language_loss": 0.8315292, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85475653, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.5353691577911377 + }, + { + "auxiliary_loss_clip": 0.01114057, + "auxiliary_loss_mlp": 0.01159783, + "balance_loss_clip": 1.00231767, + "balance_loss_mlp": 1.00128186, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 2.2648272529353504, + "language_loss": 0.93794733, + "learning_rate": 3.795932626406812e-06, + "loss": 0.96068573, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.673583984375 + }, + { + "auxiliary_loss_clip": 0.01147234, + "auxiliary_loss_mlp": 0.01159692, + "balance_loss_clip": 1.00233722, + "balance_loss_mlp": 1.00119114, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 1.816495236757963, + "language_loss": 0.83770192, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86077118, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 2.6004767417907715 + }, + { + "auxiliary_loss_clip": 0.01164015, + "auxiliary_loss_mlp": 0.01159635, + "balance_loss_clip": 1.00239563, + "balance_loss_mlp": 1.00113392, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.7572676749442386, + "language_loss": 0.76940131, + "learning_rate": 3.79558971392481e-06, + "loss": 0.79263783, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.528998851776123 + }, + { + "auxiliary_loss_clip": 0.01151218, + "auxiliary_loss_mlp": 0.01159592, + "balance_loss_clip": 1.00234365, + "balance_loss_mlp": 1.00109148, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8313840908444246, + "language_loss": 0.76774454, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79085255, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.595942735671997 + }, + { + "auxiliary_loss_clip": 0.01179464, + "auxiliary_loss_mlp": 0.01159206, + "balance_loss_clip": 1.0023669, + "balance_loss_mlp": 1.00108683, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.915416145596568, + "language_loss": 0.85664833, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88003504, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.5271031856536865 + }, + { + "auxiliary_loss_clip": 0.01179784, + "auxiliary_loss_mlp": 0.01159238, + "balance_loss_clip": 1.00265622, + "balance_loss_mlp": 1.00102305, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.709818401812651, + "language_loss": 0.68655932, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70994961, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.4879119396209717 + }, + { + "auxiliary_loss_clip": 0.0114719, + "auxiliary_loss_mlp": 0.0074912, + "balance_loss_clip": 1.00228655, + "balance_loss_mlp": 1.00103951, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.7703356921944264, + "language_loss": 0.78274369, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80170673, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.571683406829834 + }, + { + "auxiliary_loss_clip": 0.01162981, + "auxiliary_loss_mlp": 0.01159297, + "balance_loss_clip": 1.00235391, + "balance_loss_mlp": 1.00098753, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.8410707184372734, + "language_loss": 0.7791394, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8023622, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.4984049797058105 + }, + { + "auxiliary_loss_clip": 0.01162742, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_clip": 1.00218856, + "balance_loss_mlp": 1.00115323, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.6423203270426878, + "language_loss": 0.79745734, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82067645, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.5593090057373047 + }, + { + "auxiliary_loss_clip": 0.01162368, + "auxiliary_loss_mlp": 0.01159671, + "balance_loss_clip": 1.0021584, + "balance_loss_mlp": 1.00126576, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 3.6588899216096045, + "language_loss": 0.86643493, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.88965529, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.495870351791382 + }, + { + "auxiliary_loss_clip": 0.01130559, + "auxiliary_loss_mlp": 0.01159325, + "balance_loss_clip": 1.00213921, + "balance_loss_mlp": 1.00101519, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.7893495062885119, + "language_loss": 0.75201553, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77491438, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.6420681476593018 + }, + { + "auxiliary_loss_clip": 0.01134195, + "auxiliary_loss_mlp": 0.01151128, + "balance_loss_clip": 1.0028162, + "balance_loss_mlp": 1.00006616, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7953587910160653, + "language_loss": 0.57553744, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59839064, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 4.637167692184448 + }, + { + "auxiliary_loss_clip": 0.01129376, + "auxiliary_loss_mlp": 0.01158836, + "balance_loss_clip": 1.00204337, + "balance_loss_mlp": 1.0009079, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.6862879821613537, + "language_loss": 0.81380975, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83669192, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.655646800994873 + }, + { + "auxiliary_loss_clip": 0.01129963, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00118637, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 1.9982515176612292, + "language_loss": 0.93714452, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.96003437, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.6434645652770996 + }, + { + "auxiliary_loss_clip": 0.01130582, + "auxiliary_loss_mlp": 0.01159716, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00111997, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.87188407072852, + "language_loss": 0.69620395, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71910691, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.594967842102051 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.01159213, + "balance_loss_clip": 1.00195801, + "balance_loss_mlp": 1.00118899, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.6195818424328476, + "language_loss": 0.66764081, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.69038016, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 4.001284599304199 + }, + { + "auxiliary_loss_clip": 0.01147013, + "auxiliary_loss_mlp": 0.01158464, + "balance_loss_clip": 1.00219607, + "balance_loss_mlp": 1.00101256, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.5676889301758519, + "language_loss": 0.89336246, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91641724, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 5.335690975189209 + }, + { + "auxiliary_loss_clip": 0.01179381, + "auxiliary_loss_mlp": 0.01159114, + "balance_loss_clip": 1.00228596, + "balance_loss_mlp": 1.00109017, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 3.4367473169989196, + "language_loss": 0.83571947, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85910439, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.5290162563323975 + }, + { + "auxiliary_loss_clip": 0.01162819, + "auxiliary_loss_mlp": 0.01159246, + "balance_loss_clip": 1.00236154, + "balance_loss_mlp": 1.00103164, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 2.5099760368126565, + "language_loss": 0.86256361, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88578427, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.552351236343384 + }, + { + "auxiliary_loss_clip": 0.01162941, + "auxiliary_loss_mlp": 0.01159317, + "balance_loss_clip": 1.00241411, + "balance_loss_mlp": 1.00119805, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.024616574689773, + "language_loss": 0.78371799, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.8069405, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 2.5611982345581055 + }, + { + "auxiliary_loss_clip": 0.01151668, + "auxiliary_loss_mlp": 0.01159364, + "balance_loss_clip": 1.0021708, + "balance_loss_mlp": 1.00114954, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 1.9515709696527392, + "language_loss": 0.77334023, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79645061, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.564926862716675 + }, + { + "auxiliary_loss_clip": 0.01118082, + "auxiliary_loss_mlp": 0.01158926, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00099754, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 1.8672427381682783, + "language_loss": 0.77112353, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79389364, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.6616437435150146 + }, + { + "auxiliary_loss_clip": 0.01162585, + "auxiliary_loss_mlp": 0.0115922, + "balance_loss_clip": 1.00214148, + "balance_loss_mlp": 1.00119662, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 21.96143215531389, + "language_loss": 0.81748927, + "learning_rate": 3.792145618140317e-06, + "loss": 0.8407073, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.5580432415008545 + }, + { + "auxiliary_loss_clip": 0.01147149, + "auxiliary_loss_mlp": 0.01159161, + "balance_loss_clip": 1.00228643, + "balance_loss_mlp": 1.00104189, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 2.032981421988105, + "language_loss": 0.85893238, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.88199544, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.5514113903045654 + }, + { + "auxiliary_loss_clip": 0.01129955, + "auxiliary_loss_mlp": 0.0115851, + "balance_loss_clip": 1.00212979, + "balance_loss_mlp": 1.00086808, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.196529482533629, + "language_loss": 0.78132188, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80420661, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 2.655724048614502 + }, + { + "auxiliary_loss_clip": 0.01130243, + "auxiliary_loss_mlp": 0.00748956, + "balance_loss_clip": 1.00206017, + "balance_loss_mlp": 1.00092244, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7391479288630198, + "language_loss": 0.72430456, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74309659, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.6613850593566895 + }, + { + "auxiliary_loss_clip": 0.01130483, + "auxiliary_loss_mlp": 0.01159346, + "balance_loss_clip": 1.0021764, + "balance_loss_mlp": 1.00113142, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 2.069783431347332, + "language_loss": 0.72618973, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.74908805, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.6417362689971924 + }, + { + "auxiliary_loss_clip": 0.01162938, + "auxiliary_loss_mlp": 0.00748912, + "balance_loss_clip": 1.00235772, + "balance_loss_mlp": 1.00102997, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 2.3441956146887732, + "language_loss": 0.78795475, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80707324, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.547337770462036 + }, + { + "auxiliary_loss_clip": 0.01179343, + "auxiliary_loss_mlp": 0.01158977, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.00114369, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 2.6915887952004076, + "language_loss": 0.80019277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82357597, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.5652220249176025 + }, + { + "auxiliary_loss_clip": 0.01145973, + "auxiliary_loss_mlp": 0.0115845, + "balance_loss_clip": 1.00220156, + "balance_loss_mlp": 1.00090361, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.75088337834618, + "language_loss": 0.79567456, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81871879, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 2.6877706050872803 + }, + { + "auxiliary_loss_clip": 0.01118289, + "auxiliary_loss_mlp": 0.01159377, + "balance_loss_clip": 1.00241852, + "balance_loss_mlp": 1.00106716, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.7774291596921972, + "language_loss": 0.83886164, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86163831, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.778740406036377 + }, + { + "auxiliary_loss_clip": 0.01146038, + "auxiliary_loss_mlp": 0.01158652, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00100923, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.1201186952017346, + "language_loss": 0.77832586, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.80137271, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.675377130508423 + }, + { + "auxiliary_loss_clip": 0.01179205, + "auxiliary_loss_mlp": 0.01158125, + "balance_loss_clip": 1.00239527, + "balance_loss_mlp": 1.00076878, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.687861323234447, + "language_loss": 0.77344275, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79681599, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.500783920288086 + }, + { + "auxiliary_loss_clip": 0.01147101, + "auxiliary_loss_mlp": 0.01159048, + "balance_loss_clip": 1.00219131, + "balance_loss_mlp": 1.00111938, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.388352129213766, + "language_loss": 0.74652052, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76958203, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.6698946952819824 + }, + { + "auxiliary_loss_clip": 0.01179217, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_clip": 1.00226283, + "balance_loss_mlp": 1.00090742, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8451820189484083, + "language_loss": 0.82467318, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84804893, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.509129047393799 + }, + { + "auxiliary_loss_clip": 0.01130361, + "auxiliary_loss_mlp": 0.01158033, + "balance_loss_clip": 1.00210381, + "balance_loss_mlp": 1.00086737, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.1352110881116935, + "language_loss": 0.75108337, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77396733, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 2.5744569301605225 + }, + { + "auxiliary_loss_clip": 0.0117933, + "auxiliary_loss_mlp": 0.01158324, + "balance_loss_clip": 1.00232136, + "balance_loss_mlp": 1.00077701, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 2.243363560532357, + "language_loss": 0.81048036, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.83385688, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.487872838973999 + }, + { + "auxiliary_loss_clip": 0.01145568, + "auxiliary_loss_mlp": 0.01158638, + "balance_loss_clip": 1.00207984, + "balance_loss_mlp": 1.00090003, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.2440357593006666, + "language_loss": 0.87779009, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.90083218, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.5059335231781006 + }, + { + "auxiliary_loss_clip": 0.01146328, + "auxiliary_loss_mlp": 0.01158693, + "balance_loss_clip": 1.00229239, + "balance_loss_mlp": 1.00105059, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.8182928115000487, + "language_loss": 0.84668207, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86973226, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.535947322845459 + }, + { + "auxiliary_loss_clip": 0.01134425, + "auxiliary_loss_mlp": 0.01158464, + "balance_loss_clip": 1.0024699, + "balance_loss_mlp": 1.00082159, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 1.99258178762912, + "language_loss": 0.7945739, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.8175028, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.5968306064605713 + }, + { + "auxiliary_loss_clip": 0.01146728, + "auxiliary_loss_mlp": 0.01158365, + "balance_loss_clip": 1.00223541, + "balance_loss_mlp": 1.00081766, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 2.631261431064875, + "language_loss": 0.70741546, + "learning_rate": 3.78902268871344e-06, + "loss": 0.73046643, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.6248714923858643 + }, + { + "auxiliary_loss_clip": 0.01145847, + "auxiliary_loss_mlp": 0.01158428, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00107133, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4166328000389004, + "language_loss": 0.83095717, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85399991, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.53238582611084 + }, + { + "auxiliary_loss_clip": 0.01114482, + "auxiliary_loss_mlp": 0.01158266, + "balance_loss_clip": 1.00202417, + "balance_loss_mlp": 1.00081468, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 1.7974348367828532, + "language_loss": 0.80809945, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83082694, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.6642606258392334 + }, + { + "auxiliary_loss_clip": 0.01146701, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_clip": 1.00221789, + "balance_loss_mlp": 1.00099981, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 1.9283183912662303, + "language_loss": 0.77473146, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79778397, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.612584114074707 + }, + { + "auxiliary_loss_clip": 0.01114247, + "auxiliary_loss_mlp": 0.01158161, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00090039, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 2.007819508492028, + "language_loss": 0.76338565, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78610969, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.740345001220703 + }, + { + "auxiliary_loss_clip": 0.01130192, + "auxiliary_loss_mlp": 0.01157956, + "balance_loss_clip": 1.00202763, + "balance_loss_mlp": 1.00088573, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.916823427340508, + "language_loss": 0.85573226, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87861371, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.6391918659210205 + }, + { + "auxiliary_loss_clip": 0.01146578, + "auxiliary_loss_mlp": 0.00748834, + "balance_loss_clip": 1.00216424, + "balance_loss_mlp": 1.00076604, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.522624535723435, + "language_loss": 0.74185431, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76080847, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.6340034008026123 + }, + { + "auxiliary_loss_clip": 0.01147108, + "auxiliary_loss_mlp": 0.01157994, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00082874, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.6624883292621173, + "language_loss": 0.70718014, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.73023117, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.605557441711426 + }, + { + "auxiliary_loss_clip": 0.01162539, + "auxiliary_loss_mlp": 0.01158111, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.00094533, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9633877994863573, + "language_loss": 0.69376391, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71697038, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 4.017061948776245 + }, + { + "auxiliary_loss_clip": 0.0113136, + "auxiliary_loss_mlp": 0.01158176, + "balance_loss_clip": 1.00226486, + "balance_loss_mlp": 1.00100994, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.8154372550200035, + "language_loss": 0.85538965, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87828505, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.5994014739990234 + }, + { + "auxiliary_loss_clip": 0.01097435, + "auxiliary_loss_mlp": 0.01158485, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00103331, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 1.9770278527497112, + "language_loss": 0.78966856, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.81222773, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.7381348609924316 + }, + { + "auxiliary_loss_clip": 0.0112863, + "auxiliary_loss_mlp": 0.00748802, + "balance_loss_clip": 1.00193322, + "balance_loss_mlp": 1.00083661, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.0586212067457756, + "language_loss": 0.84405375, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86282808, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.6118743419647217 + }, + { + "auxiliary_loss_clip": 0.01162806, + "auxiliary_loss_mlp": 0.01158654, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.0011065, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.8580386052703908, + "language_loss": 0.82587838, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84909296, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.557534694671631 + }, + { + "auxiliary_loss_clip": 0.01132582, + "auxiliary_loss_mlp": 0.01158521, + "balance_loss_clip": 1.00220275, + "balance_loss_mlp": 1.00087821, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.06111205215385, + "language_loss": 0.81466055, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83757162, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 5.520157814025879 + }, + { + "auxiliary_loss_clip": 0.01162865, + "auxiliary_loss_mlp": 0.01158793, + "balance_loss_clip": 1.0024277, + "balance_loss_mlp": 1.00124574, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 4.07525007499524, + "language_loss": 0.74439621, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76761281, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.5846567153930664 + }, + { + "auxiliary_loss_clip": 0.011474, + "auxiliary_loss_mlp": 0.01158461, + "balance_loss_clip": 1.00222194, + "balance_loss_mlp": 1.00100935, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 1.9216494338922692, + "language_loss": 0.83083665, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85389531, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.602785587310791 + }, + { + "auxiliary_loss_clip": 0.01130534, + "auxiliary_loss_mlp": 0.01158721, + "balance_loss_clip": 1.00223207, + "balance_loss_mlp": 1.00117397, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.078786462299932, + "language_loss": 0.74191582, + "learning_rate": 3.786228297806741e-06, + "loss": 0.7648083, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.599794387817383 + }, + { + "auxiliary_loss_clip": 0.01113747, + "auxiliary_loss_mlp": 0.01150295, + "balance_loss_clip": 1.00301623, + "balance_loss_mlp": 0.99999553, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.9071121156520019, + "language_loss": 0.62787664, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.65051705, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.2944254875183105 + }, + { + "auxiliary_loss_clip": 0.01146479, + "auxiliary_loss_mlp": 0.00748802, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00075734, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 5.9279822318281905, + "language_loss": 0.76390111, + "learning_rate": 3.785877779175034e-06, + "loss": 0.78285384, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.6358563899993896 + }, + { + "auxiliary_loss_clip": 0.01162518, + "auxiliary_loss_mlp": 0.01157801, + "balance_loss_clip": 1.00215507, + "balance_loss_mlp": 1.0010165, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 2.137702673122096, + "language_loss": 0.6951189, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.7183221, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.6298089027404785 + }, + { + "auxiliary_loss_clip": 0.01147142, + "auxiliary_loss_mlp": 0.01158557, + "balance_loss_clip": 1.0024066, + "balance_loss_mlp": 1.00110507, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.073428845408769, + "language_loss": 0.76183152, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78488851, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.6182985305786133 + }, + { + "auxiliary_loss_clip": 0.01113388, + "auxiliary_loss_mlp": 0.01157735, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00095141, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 2.098976368720352, + "language_loss": 0.72542763, + "learning_rate": 3.785351493339121e-06, + "loss": 0.74813884, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.661777973175049 + }, + { + "auxiliary_loss_clip": 0.01130954, + "auxiliary_loss_mlp": 0.007488, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00067508, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.5926703014744568, + "language_loss": 0.69818956, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71698707, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.7940118312835693 + }, + { + "auxiliary_loss_clip": 0.011472, + "auxiliary_loss_mlp": 0.01158412, + "balance_loss_clip": 1.0022521, + "balance_loss_mlp": 1.00095999, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7733467236509601, + "language_loss": 0.76257563, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78563172, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.6147990226745605 + }, + { + "auxiliary_loss_clip": 0.0116314, + "auxiliary_loss_mlp": 0.01158201, + "balance_loss_clip": 1.00221992, + "balance_loss_mlp": 1.00113106, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.0431351271829787, + "language_loss": 0.81746233, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.84067571, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.514023542404175 + }, + { + "auxiliary_loss_clip": 0.0114599, + "auxiliary_loss_mlp": 0.01158173, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00100732, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 1.7135858273423814, + "language_loss": 0.73482728, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75786889, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.5431437492370605 + }, + { + "auxiliary_loss_clip": 0.01102924, + "auxiliary_loss_mlp": 0.01157598, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00090909, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 2.1846286974519944, + "language_loss": 0.64679933, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66940451, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.7061965465545654 + }, + { + "auxiliary_loss_clip": 0.0113028, + "auxiliary_loss_mlp": 0.01158342, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00089049, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.9531834641170684, + "language_loss": 0.79476726, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81765348, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 2.665963888168335 + }, + { + "auxiliary_loss_clip": 0.01162626, + "auxiliary_loss_mlp": 0.01157983, + "balance_loss_clip": 1.00229716, + "balance_loss_mlp": 1.00100815, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 2.1232205705861866, + "language_loss": 0.81011653, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83332264, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.565234661102295 + }, + { + "auxiliary_loss_clip": 0.01162601, + "auxiliary_loss_mlp": 0.01158234, + "balance_loss_clip": 1.00229406, + "balance_loss_mlp": 1.00116408, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.122001589037594, + "language_loss": 0.80838823, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83159661, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.4905800819396973 + }, + { + "auxiliary_loss_clip": 0.01147058, + "auxiliary_loss_mlp": 0.01158106, + "balance_loss_clip": 1.00219762, + "balance_loss_mlp": 1.00113153, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.3314244492142264, + "language_loss": 0.80618191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82923353, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.518242120742798 + }, + { + "auxiliary_loss_clip": 0.01115161, + "auxiliary_loss_mlp": 0.01158544, + "balance_loss_clip": 1.00226331, + "balance_loss_mlp": 1.00118732, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.8096242894022323, + "language_loss": 0.76765615, + "learning_rate": 3.783592807684017e-06, + "loss": 0.79039323, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.649428606033325 + }, + { + "auxiliary_loss_clip": 0.01179298, + "auxiliary_loss_mlp": 0.01158257, + "balance_loss_clip": 1.00233459, + "balance_loss_mlp": 1.00099659, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6573831343986816, + "language_loss": 0.87248683, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89586234, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.5809130668640137 + }, + { + "auxiliary_loss_clip": 0.01179285, + "auxiliary_loss_mlp": 0.00748694, + "balance_loss_clip": 1.00230598, + "balance_loss_mlp": 1.00067544, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.3221514785364867, + "language_loss": 0.89806479, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91734457, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.4763078689575195 + }, + { + "auxiliary_loss_clip": 0.01163354, + "auxiliary_loss_mlp": 0.01158426, + "balance_loss_clip": 1.00225973, + "balance_loss_mlp": 1.00087881, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.322445824550765, + "language_loss": 0.72497368, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74819148, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.5196709632873535 + }, + { + "auxiliary_loss_clip": 0.01147576, + "auxiliary_loss_mlp": 0.01157736, + "balance_loss_clip": 1.00220871, + "balance_loss_mlp": 1.00085688, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 8.835582683830467, + "language_loss": 0.69426417, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71731734, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 2.5659003257751465 + }, + { + "auxiliary_loss_clip": 0.01162509, + "auxiliary_loss_mlp": 0.01158004, + "balance_loss_clip": 1.00224972, + "balance_loss_mlp": 1.00083828, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.9653903414375162, + "language_loss": 0.93500727, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95821238, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.5689074993133545 + }, + { + "auxiliary_loss_clip": 0.01133673, + "auxiliary_loss_mlp": 0.01157857, + "balance_loss_clip": 1.00208926, + "balance_loss_mlp": 1.00116801, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 3.990827052780397, + "language_loss": 0.81431961, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83723491, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 2.5958540439605713 + }, + { + "auxiliary_loss_clip": 0.01162678, + "auxiliary_loss_mlp": 0.01158238, + "balance_loss_clip": 1.00225759, + "balance_loss_mlp": 1.00097752, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.8937912657033857, + "language_loss": 0.73914623, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76235545, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.5439414978027344 + }, + { + "auxiliary_loss_clip": 0.01162876, + "auxiliary_loss_mlp": 0.01157537, + "balance_loss_clip": 1.00234699, + "balance_loss_mlp": 1.00084817, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 2.444158859498857, + "language_loss": 0.77218282, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79538703, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.534791946411133 + }, + { + "auxiliary_loss_clip": 0.01097394, + "auxiliary_loss_mlp": 0.01158222, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00076985, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 1.909089809355701, + "language_loss": 0.73754346, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76009965, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.7301018238067627 + }, + { + "auxiliary_loss_clip": 0.01147311, + "auxiliary_loss_mlp": 0.0115805, + "balance_loss_clip": 1.00224257, + "balance_loss_mlp": 1.00117016, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.8141374852806105, + "language_loss": 0.74184012, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76489377, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.641721725463867 + }, + { + "auxiliary_loss_clip": 0.01147749, + "auxiliary_loss_mlp": 0.01157618, + "balance_loss_clip": 1.00213349, + "balance_loss_mlp": 1.00083423, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3839047314685846, + "language_loss": 0.79565334, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81870699, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.55149245262146 + }, + { + "auxiliary_loss_clip": 0.01130455, + "auxiliary_loss_mlp": 0.01158904, + "balance_loss_clip": 1.00203657, + "balance_loss_mlp": 1.00126147, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6867238246674814, + "language_loss": 0.87971985, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.9026134, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.6346218585968018 + }, + { + "auxiliary_loss_clip": 0.01162504, + "auxiliary_loss_mlp": 0.01158094, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00102425, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 4.405851498262417, + "language_loss": 0.62525791, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.6484639, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.5772814750671387 + }, + { + "auxiliary_loss_clip": 0.0113105, + "auxiliary_loss_mlp": 0.01158113, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00094807, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.7332651149849188, + "language_loss": 0.80361181, + "learning_rate": 3.78111928675413e-06, + "loss": 0.8265034, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.6835999488830566 + }, + { + "auxiliary_loss_clip": 0.01145694, + "auxiliary_loss_mlp": 0.01158853, + "balance_loss_clip": 1.00208867, + "balance_loss_mlp": 1.00140178, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 2.710501247672665, + "language_loss": 0.71094078, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73398626, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 4.044670581817627 + }, + { + "auxiliary_loss_clip": 0.01129203, + "auxiliary_loss_mlp": 0.01157435, + "balance_loss_clip": 1.00212872, + "balance_loss_mlp": 1.00093663, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.8412178317528758, + "language_loss": 0.71681535, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73968172, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.645193338394165 + }, + { + "auxiliary_loss_clip": 0.01131416, + "auxiliary_loss_mlp": 0.01157956, + "balance_loss_clip": 1.0020926, + "balance_loss_mlp": 1.0008862, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.9594900919678209, + "language_loss": 0.84989601, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.87278974, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.608337879180908 + }, + { + "auxiliary_loss_clip": 0.01117772, + "auxiliary_loss_mlp": 0.01158087, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00101733, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.9229061722303789, + "language_loss": 0.72073424, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74349284, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.7554867267608643 + }, + { + "auxiliary_loss_clip": 0.01132048, + "auxiliary_loss_mlp": 0.01157457, + "balance_loss_clip": 1.00225139, + "balance_loss_mlp": 1.00086379, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 1.7761053507716922, + "language_loss": 0.83018678, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85308182, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 4.01569390296936 + }, + { + "auxiliary_loss_clip": 0.01146823, + "auxiliary_loss_mlp": 0.01157898, + "balance_loss_clip": 1.00211763, + "balance_loss_mlp": 1.00082779, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.5762817094901491, + "language_loss": 0.79561222, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81865942, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 5.535152196884155 + }, + { + "auxiliary_loss_clip": 0.01179234, + "auxiliary_loss_mlp": 0.0115837, + "balance_loss_clip": 1.00237203, + "balance_loss_mlp": 1.00101328, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 3.3933945761268545, + "language_loss": 0.76681113, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.79018712, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.538541078567505 + }, + { + "auxiliary_loss_clip": 0.01082298, + "auxiliary_loss_mlp": 0.0115765, + "balance_loss_clip": 1.00193274, + "balance_loss_mlp": 1.00077069, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.777565346998226, + "language_loss": 0.75653708, + "learning_rate": 3.779699901503696e-06, + "loss": 0.77893651, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 2.7227957248687744 + }, + { + "auxiliary_loss_clip": 0.01163772, + "auxiliary_loss_mlp": 0.01158162, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00080526, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.208382927840857, + "language_loss": 0.90024811, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92346746, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.495239019393921 + }, + { + "auxiliary_loss_clip": 0.01179062, + "auxiliary_loss_mlp": 0.01157821, + "balance_loss_clip": 1.00227475, + "balance_loss_mlp": 1.00094128, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 2.5767974173143555, + "language_loss": 0.88497877, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90834761, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.524495840072632 + }, + { + "auxiliary_loss_clip": 0.01145615, + "auxiliary_loss_mlp": 0.01157839, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00114989, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.5459955638397191, + "language_loss": 0.70423055, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72726512, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 2.8514010906219482 + }, + { + "auxiliary_loss_clip": 0.01130014, + "auxiliary_loss_mlp": 0.01158557, + "balance_loss_clip": 1.00208783, + "balance_loss_mlp": 1.0007242, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.050870225540141, + "language_loss": 0.69504154, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71792734, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.6178157329559326 + }, + { + "auxiliary_loss_clip": 0.01114848, + "auxiliary_loss_mlp": 0.01157815, + "balance_loss_clip": 1.00220847, + "balance_loss_mlp": 1.00083983, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 2.5031871908518757, + "language_loss": 0.71565437, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73838091, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.7397167682647705 + }, + { + "auxiliary_loss_clip": 0.01146001, + "auxiliary_loss_mlp": 0.0115832, + "balance_loss_clip": 1.0021739, + "balance_loss_mlp": 1.00105929, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.864236759878672, + "language_loss": 0.75788367, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78092682, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.5787110328674316 + }, + { + "auxiliary_loss_clip": 0.01162498, + "auxiliary_loss_mlp": 0.01158189, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00102329, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.0420022435114427, + "language_loss": 0.70448101, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.72768784, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.5507800579071045 + }, + { + "auxiliary_loss_clip": 0.01179295, + "auxiliary_loss_mlp": 0.01157703, + "balance_loss_clip": 1.00244784, + "balance_loss_mlp": 1.00082374, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.149865236407056, + "language_loss": 0.73983884, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.76320881, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.4727425575256348 + }, + { + "auxiliary_loss_clip": 0.01129093, + "auxiliary_loss_mlp": 0.01158334, + "balance_loss_clip": 1.00210309, + "balance_loss_mlp": 1.00107324, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.540064517064739, + "language_loss": 0.85709155, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87996584, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.5649216175079346 + }, + { + "auxiliary_loss_clip": 0.01179198, + "auxiliary_loss_mlp": 0.01157472, + "balance_loss_clip": 1.00229561, + "balance_loss_mlp": 1.00087881, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.193980440933915, + "language_loss": 0.77012819, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79349482, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.5189976692199707 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.00748737, + "balance_loss_clip": 1.00225449, + "balance_loss_mlp": 1.00061107, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 2.0601156028277527, + "language_loss": 0.80350912, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82231468, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.7080795764923096 + }, + { + "auxiliary_loss_clip": 0.01147464, + "auxiliary_loss_mlp": 0.01158379, + "balance_loss_clip": 1.00230432, + "balance_loss_mlp": 1.00111818, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 1.81485292441914, + "language_loss": 0.80735767, + "learning_rate": 3.777562726341155e-06, + "loss": 0.83041614, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 2.6182162761688232 + }, + { + "auxiliary_loss_clip": 0.01179037, + "auxiliary_loss_mlp": 0.01158284, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.00130916, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 3.232647700043522, + "language_loss": 0.73621714, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75959039, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.6859805583953857 + }, + { + "auxiliary_loss_clip": 0.01162553, + "auxiliary_loss_mlp": 0.011583, + "balance_loss_clip": 1.00240088, + "balance_loss_mlp": 1.00113463, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 3.424824596691311, + "language_loss": 0.78018922, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80339777, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.52549409866333 + }, + { + "auxiliary_loss_clip": 0.01131748, + "auxiliary_loss_mlp": 0.0115824, + "balance_loss_clip": 1.00231791, + "balance_loss_mlp": 1.00126481, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 2.1803729209719966, + "language_loss": 0.76033354, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78323346, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.7412405014038086 + }, + { + "auxiliary_loss_clip": 0.01163362, + "auxiliary_loss_mlp": 0.01158146, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00098014, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 3.1419129245851325, + "language_loss": 0.7289536, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.75216872, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.6606292724609375 + }, + { + "auxiliary_loss_clip": 0.01167491, + "auxiliary_loss_mlp": 0.01157613, + "balance_loss_clip": 1.00247145, + "balance_loss_mlp": 1.00101972, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.911893344395306, + "language_loss": 0.82236564, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8456167, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.5602662563323975 + }, + { + "auxiliary_loss_clip": 0.01162579, + "auxiliary_loss_mlp": 0.01149623, + "balance_loss_clip": 1.00311792, + "balance_loss_mlp": 1.00008667, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7591446213202248, + "language_loss": 0.64999449, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67311645, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.20314359664917 + }, + { + "auxiliary_loss_clip": 0.01128516, + "auxiliary_loss_mlp": 0.01157556, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00096273, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 2.0742274686024116, + "language_loss": 0.83783352, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.86069417, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 2.6753039360046387 + }, + { + "auxiliary_loss_clip": 0.01147105, + "auxiliary_loss_mlp": 0.01158324, + "balance_loss_clip": 1.00222373, + "balance_loss_mlp": 1.00106287, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.3497503932200132, + "language_loss": 0.80427134, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82732564, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.580406904220581 + }, + { + "auxiliary_loss_clip": 0.01179005, + "auxiliary_loss_mlp": 0.01157546, + "balance_loss_clip": 1.00228143, + "balance_loss_mlp": 1.00104797, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.0205356715541667, + "language_loss": 0.79185832, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81522375, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 2.5285024642944336 + }, + { + "auxiliary_loss_clip": 0.0113096, + "auxiliary_loss_mlp": 0.01158069, + "balance_loss_clip": 1.00219488, + "balance_loss_mlp": 1.00109446, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8766603865799663, + "language_loss": 0.87883365, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90172398, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.700836420059204 + }, + { + "auxiliary_loss_clip": 0.01145978, + "auxiliary_loss_mlp": 0.01158706, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00115919, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 1.7759648534026884, + "language_loss": 0.85187125, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.8749181, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.587320566177368 + }, + { + "auxiliary_loss_clip": 0.01146822, + "auxiliary_loss_mlp": 0.01158126, + "balance_loss_clip": 1.00217283, + "balance_loss_mlp": 1.00105608, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.7514197411426657, + "language_loss": 0.7092576, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73230708, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.5777156352996826 + }, + { + "auxiliary_loss_clip": 0.01162259, + "auxiliary_loss_mlp": 0.0115803, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00143707, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 3.2098201000638364, + "language_loss": 0.83327103, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85647386, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.5514214038848877 + }, + { + "auxiliary_loss_clip": 0.01115577, + "auxiliary_loss_mlp": 0.01157566, + "balance_loss_clip": 1.00220132, + "balance_loss_mlp": 1.00106788, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.6969305237530254, + "language_loss": 0.75016677, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7728982, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.674079179763794 + }, + { + "auxiliary_loss_clip": 0.01146993, + "auxiliary_loss_mlp": 0.0115849, + "balance_loss_clip": 1.00225055, + "balance_loss_mlp": 1.00103807, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.5570237845208, + "language_loss": 0.79847503, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82152975, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.558321952819824 + }, + { + "auxiliary_loss_clip": 0.01179256, + "auxiliary_loss_mlp": 0.01158584, + "balance_loss_clip": 1.00232768, + "balance_loss_mlp": 1.00113297, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 1.8325506394092486, + "language_loss": 0.51855183, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54193026, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.4685001373291016 + }, + { + "auxiliary_loss_clip": 0.0111435, + "auxiliary_loss_mlp": 0.01158119, + "balance_loss_clip": 1.00188673, + "balance_loss_mlp": 1.00104833, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 3.2510993079529573, + "language_loss": 0.88891065, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91163534, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.6580677032470703 + }, + { + "auxiliary_loss_clip": 0.0113084, + "auxiliary_loss_mlp": 0.01158444, + "balance_loss_clip": 1.00224721, + "balance_loss_mlp": 1.00118351, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.58980455336705, + "language_loss": 0.79410827, + "learning_rate": 3.774338767820631e-06, + "loss": 0.8170011, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 2.6425940990448 + }, + { + "auxiliary_loss_clip": 0.01163376, + "auxiliary_loss_mlp": 0.01158603, + "balance_loss_clip": 1.00223196, + "balance_loss_mlp": 1.00096059, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.933722411865027, + "language_loss": 0.75049001, + "learning_rate": 3.774159019458203e-06, + "loss": 0.77370983, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 3.994852304458618 + }, + { + "auxiliary_loss_clip": 0.01145901, + "auxiliary_loss_mlp": 0.01158772, + "balance_loss_clip": 1.00221395, + "balance_loss_mlp": 1.00132084, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.4490112540017366, + "language_loss": 0.78758222, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.81062901, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.6200122833251953 + }, + { + "auxiliary_loss_clip": 0.01162467, + "auxiliary_loss_mlp": 0.00748778, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00067878, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 2.040885688114086, + "language_loss": 0.81338871, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83250117, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.5845041275024414 + }, + { + "auxiliary_loss_clip": 0.01162131, + "auxiliary_loss_mlp": 0.01158497, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00114095, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 3.22277059026341, + "language_loss": 0.95109546, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.9743017, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.4713523387908936 + }, + { + "auxiliary_loss_clip": 0.01114714, + "auxiliary_loss_mlp": 0.00748708, + "balance_loss_clip": 1.00207818, + "balance_loss_mlp": 1.00064754, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.144595172024948, + "language_loss": 0.73146826, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.7501024, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 4.190002679824829 + }, + { + "auxiliary_loss_clip": 0.01148084, + "auxiliary_loss_mlp": 0.01158001, + "balance_loss_clip": 1.00231874, + "balance_loss_mlp": 1.00121641, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.031499957679669, + "language_loss": 0.77208024, + "learning_rate": 3.773259268638157e-06, + "loss": 0.7951411, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 3.9446473121643066 + }, + { + "auxiliary_loss_clip": 0.0109936, + "auxiliary_loss_mlp": 0.01158424, + "balance_loss_clip": 1.00220621, + "balance_loss_mlp": 1.00116336, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.8676631997414055, + "language_loss": 0.7588551, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78143299, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.777719497680664 + }, + { + "auxiliary_loss_clip": 0.01146076, + "auxiliary_loss_mlp": 0.01149827, + "balance_loss_clip": 1.00252628, + "balance_loss_mlp": 1.00029075, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8445305257458744, + "language_loss": 0.69063818, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71359718, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 3.2447996139526367 + }, + { + "auxiliary_loss_clip": 0.011471, + "auxiliary_loss_mlp": 0.01158108, + "balance_loss_clip": 1.00228107, + "balance_loss_mlp": 1.00094211, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 1.6748157366168015, + "language_loss": 0.67724001, + "learning_rate": 3.772718611185505e-06, + "loss": 0.70029211, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 2.7165112495422363 + }, + { + "auxiliary_loss_clip": 0.01118786, + "auxiliary_loss_mlp": 0.01158167, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00090623, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.5994180610225792, + "language_loss": 0.89816463, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.92093414, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.705531120300293 + }, + { + "auxiliary_loss_clip": 0.01131229, + "auxiliary_loss_mlp": 0.01158621, + "balance_loss_clip": 1.00216484, + "balance_loss_mlp": 1.00126505, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.1780140326885795, + "language_loss": 0.88233161, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90523005, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.5740175247192383 + }, + { + "auxiliary_loss_clip": 0.01179217, + "auxiliary_loss_mlp": 0.01158178, + "balance_loss_clip": 1.00239789, + "balance_loss_mlp": 1.00120294, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 1.9944025723262129, + "language_loss": 0.76157457, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78494853, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.4919281005859375 + }, + { + "auxiliary_loss_clip": 0.01146123, + "auxiliary_loss_mlp": 0.01158154, + "balance_loss_clip": 1.00224054, + "balance_loss_mlp": 1.00098801, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.6753165831536188, + "language_loss": 0.74836397, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.77140677, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.5825905799865723 + }, + { + "auxiliary_loss_clip": 0.01163014, + "auxiliary_loss_mlp": 0.01157947, + "balance_loss_clip": 1.00216162, + "balance_loss_mlp": 1.00116277, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5449129010383982, + "language_loss": 0.73300654, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75621611, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.58571720123291 + }, + { + "auxiliary_loss_clip": 0.01163439, + "auxiliary_loss_mlp": 0.01157374, + "balance_loss_clip": 1.00229001, + "balance_loss_mlp": 1.00125742, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.532845236559165, + "language_loss": 0.77482611, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79803425, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.5757670402526855 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.01158257, + "balance_loss_clip": 1.00226235, + "balance_loss_mlp": 1.00109172, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 3.143139555790225, + "language_loss": 0.80025357, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.823174, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.6248247623443604 + }, + { + "auxiliary_loss_clip": 0.01146784, + "auxiliary_loss_mlp": 0.0115839, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00112939, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.4490912366008548, + "language_loss": 0.76963598, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.79268771, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.6426286697387695 + }, + { + "auxiliary_loss_clip": 0.0112986, + "auxiliary_loss_mlp": 0.01157596, + "balance_loss_clip": 1.00197089, + "balance_loss_mlp": 1.00109816, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.7272854578188346, + "language_loss": 0.6933372, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7162118, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.594996929168701 + }, + { + "auxiliary_loss_clip": 0.0116321, + "auxiliary_loss_mlp": 0.01158065, + "balance_loss_clip": 1.0023191, + "balance_loss_mlp": 1.00089979, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.72995442805569, + "language_loss": 0.7108714, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73408419, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.5100245475769043 + }, + { + "auxiliary_loss_clip": 0.01146011, + "auxiliary_loss_mlp": 0.01158855, + "balance_loss_clip": 1.00239301, + "balance_loss_mlp": 1.00140333, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.3070579635944704, + "language_loss": 0.82277954, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84582818, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.5386059284210205 + }, + { + "auxiliary_loss_clip": 0.01179068, + "auxiliary_loss_mlp": 0.01157787, + "balance_loss_clip": 1.00235963, + "balance_loss_mlp": 1.00100267, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.4934383646993592, + "language_loss": 0.8290807, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.8524493, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.5714495182037354 + }, + { + "auxiliary_loss_clip": 0.01163265, + "auxiliary_loss_mlp": 0.01158325, + "balance_loss_clip": 1.00229883, + "balance_loss_mlp": 1.00115967, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.2690357518485564, + "language_loss": 0.85488164, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87809759, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.539642095565796 + }, + { + "auxiliary_loss_clip": 0.01128864, + "auxiliary_loss_mlp": 0.01157603, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00072408, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 1.616240426300646, + "language_loss": 0.89387405, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91673875, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.7052667140960693 + }, + { + "auxiliary_loss_clip": 0.01178927, + "auxiliary_loss_mlp": 0.01157467, + "balance_loss_clip": 1.00236511, + "balance_loss_mlp": 1.00116014, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 2.1756719403795417, + "language_loss": 0.69743794, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72080183, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.5289595127105713 + }, + { + "auxiliary_loss_clip": 0.0117898, + "auxiliary_loss_mlp": 0.00748698, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.00060785, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.0129176683329564, + "language_loss": 0.77711648, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79639328, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.5538549423217773 + }, + { + "auxiliary_loss_clip": 0.01179076, + "auxiliary_loss_mlp": 0.01157763, + "balance_loss_clip": 1.00236225, + "balance_loss_mlp": 1.00088394, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 1.72978138853992, + "language_loss": 0.78608501, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80945337, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.474759340286255 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.00747992, + "balance_loss_clip": 1.00216246, + "balance_loss_mlp": 1.00005484, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7518304241866847, + "language_loss": 0.62667996, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64527136, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.159273147583008 + }, + { + "auxiliary_loss_clip": 0.01146212, + "auxiliary_loss_mlp": 0.01157331, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.00083351, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 2.1249993099198345, + "language_loss": 0.70441931, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72745478, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.559844732284546 + }, + { + "auxiliary_loss_clip": 0.01145762, + "auxiliary_loss_mlp": 0.01157875, + "balance_loss_clip": 1.00218976, + "balance_loss_mlp": 1.00099564, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 2.349353651384765, + "language_loss": 0.68606138, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.7090978, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.7466137409210205 + }, + { + "auxiliary_loss_clip": 0.01112764, + "auxiliary_loss_mlp": 0.01157658, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.00106514, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5603412834988069, + "language_loss": 0.82798833, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.85069263, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 2.6771888732910156 + }, + { + "auxiliary_loss_clip": 0.01162158, + "auxiliary_loss_mlp": 0.01156895, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.0008738, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 1.8323304314151856, + "language_loss": 0.82493973, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84813023, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.523988723754883 + }, + { + "auxiliary_loss_clip": 0.01163557, + "auxiliary_loss_mlp": 0.01157735, + "balance_loss_clip": 1.00226247, + "balance_loss_mlp": 1.00095105, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7525686789296502, + "language_loss": 0.78436649, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.8075794, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.5069491863250732 + }, + { + "auxiliary_loss_clip": 0.01179125, + "auxiliary_loss_mlp": 0.01157815, + "balance_loss_clip": 1.00243354, + "balance_loss_mlp": 1.00103116, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 2.0325512037184743, + "language_loss": 0.80519348, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82856286, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.4953060150146484 + }, + { + "auxiliary_loss_clip": 0.01162552, + "auxiliary_loss_mlp": 0.0115825, + "balance_loss_clip": 1.00234342, + "balance_loss_mlp": 1.00127554, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.6860804421008375, + "language_loss": 0.84565562, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86886364, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.531310796737671 + }, + { + "auxiliary_loss_clip": 0.01130508, + "auxiliary_loss_mlp": 0.01157246, + "balance_loss_clip": 1.00202167, + "balance_loss_mlp": 1.00093901, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 1.650526873825654, + "language_loss": 0.88159382, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90447146, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.6017673015594482 + }, + { + "auxiliary_loss_clip": 0.01146216, + "auxiliary_loss_mlp": 0.01158013, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00094295, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 1.9889735706258325, + "language_loss": 0.85184598, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87488824, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.5988566875457764 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01157704, + "balance_loss_clip": 1.00250852, + "balance_loss_mlp": 1.00111115, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8472831156992664, + "language_loss": 0.84125888, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86462641, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.558666944503784 + }, + { + "auxiliary_loss_clip": 0.01163466, + "auxiliary_loss_mlp": 0.01157512, + "balance_loss_clip": 1.0023036, + "balance_loss_mlp": 1.00101447, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.5635091086003479, + "language_loss": 0.75132227, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77453208, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 3.973623752593994 + }, + { + "auxiliary_loss_clip": 0.01146341, + "auxiliary_loss_mlp": 0.00748631, + "balance_loss_clip": 1.00229657, + "balance_loss_mlp": 1.00045085, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 1.640919584059962, + "language_loss": 0.71128029, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73022997, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.5843231678009033 + }, + { + "auxiliary_loss_clip": 0.01162443, + "auxiliary_loss_mlp": 0.01157817, + "balance_loss_clip": 1.00239849, + "balance_loss_mlp": 1.00112796, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.78068779546054, + "language_loss": 0.88275617, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90595871, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.5387260913848877 + }, + { + "auxiliary_loss_clip": 0.01178904, + "auxiliary_loss_mlp": 0.0115737, + "balance_loss_clip": 1.00233495, + "balance_loss_mlp": 1.00096774, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.724597321560046, + "language_loss": 0.80555379, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82891655, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.4714272022247314 + }, + { + "auxiliary_loss_clip": 0.01178956, + "auxiliary_loss_mlp": 0.0115776, + "balance_loss_clip": 1.00233865, + "balance_loss_mlp": 1.00116634, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 1.9500789882090985, + "language_loss": 0.67398334, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.6973505, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.5295896530151367 + }, + { + "auxiliary_loss_clip": 0.0116239, + "auxiliary_loss_mlp": 0.01158299, + "balance_loss_clip": 1.002298, + "balance_loss_mlp": 1.00113356, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.6168770759913293, + "language_loss": 0.85117495, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.8743819, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 5.283065319061279 + }, + { + "auxiliary_loss_clip": 0.01162184, + "auxiliary_loss_mlp": 0.01156975, + "balance_loss_clip": 1.0023129, + "balance_loss_mlp": 1.00076342, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.4440118019625956, + "language_loss": 0.83510065, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85829228, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 4.020169734954834 + }, + { + "auxiliary_loss_clip": 0.01147891, + "auxiliary_loss_mlp": 0.0115793, + "balance_loss_clip": 1.00239587, + "balance_loss_mlp": 1.00095499, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.957004216622463, + "language_loss": 0.77193964, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79499787, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.6330199241638184 + }, + { + "auxiliary_loss_clip": 0.01145924, + "auxiliary_loss_mlp": 0.01148901, + "balance_loss_clip": 1.00286889, + "balance_loss_mlp": 1.00012779, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8100185187120167, + "language_loss": 0.56913781, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59208608, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.297372579574585 + }, + { + "auxiliary_loss_clip": 0.01146814, + "auxiliary_loss_mlp": 0.0115807, + "balance_loss_clip": 1.0023005, + "balance_loss_mlp": 1.00119102, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 2.006669874175914, + "language_loss": 0.67147982, + "learning_rate": 3.765817980138021e-06, + "loss": 0.6945287, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.5699546337127686 + }, + { + "auxiliary_loss_clip": 0.01178983, + "auxiliary_loss_mlp": 0.01157716, + "balance_loss_clip": 1.00247061, + "balance_loss_mlp": 1.0010277, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 2.1122963432525546, + "language_loss": 0.7589165, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.78228348, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.524071455001831 + }, + { + "auxiliary_loss_clip": 0.01144878, + "auxiliary_loss_mlp": 0.01156915, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00089359, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6258460288434096, + "language_loss": 0.67625725, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.69927514, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.548895835876465 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.00748518, + "balance_loss_clip": 1.00216305, + "balance_loss_mlp": 1.00052834, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.5740341925458436, + "language_loss": 0.71643209, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73522812, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 2.9064621925354004 + }, + { + "auxiliary_loss_clip": 0.01150772, + "auxiliary_loss_mlp": 0.01157552, + "balance_loss_clip": 1.00242805, + "balance_loss_mlp": 1.0011493, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.578912411249708, + "language_loss": 0.62894297, + "learning_rate": 3.765085966704609e-06, + "loss": 0.65202618, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.6696908473968506 + }, + { + "auxiliary_loss_clip": 0.0114669, + "auxiliary_loss_mlp": 0.01158046, + "balance_loss_clip": 1.00230169, + "balance_loss_mlp": 1.00126243, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 3.4460251527359995, + "language_loss": 0.76356709, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78661448, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.5719048976898193 + }, + { + "auxiliary_loss_clip": 0.01179232, + "auxiliary_loss_mlp": 0.01158554, + "balance_loss_clip": 1.00255013, + "balance_loss_mlp": 1.00110281, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.58916541661331, + "language_loss": 0.65975547, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68313336, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.553765296936035 + }, + { + "auxiliary_loss_clip": 0.01145527, + "auxiliary_loss_mlp": 0.00748593, + "balance_loss_clip": 1.00217974, + "balance_loss_mlp": 1.00045919, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 2.4496363249325297, + "language_loss": 0.78165704, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80059826, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.5736048221588135 + }, + { + "auxiliary_loss_clip": 0.01163492, + "auxiliary_loss_mlp": 0.01158096, + "balance_loss_clip": 1.00241899, + "balance_loss_mlp": 1.00121689, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.593604103667515, + "language_loss": 0.83470297, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.8579188, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.547996759414673 + }, + { + "auxiliary_loss_clip": 0.0116342, + "auxiliary_loss_mlp": 0.01157361, + "balance_loss_clip": 1.00240541, + "balance_loss_mlp": 1.00095797, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 1.905508902773161, + "language_loss": 0.67562044, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69882822, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 2.657970428466797 + }, + { + "auxiliary_loss_clip": 0.01162122, + "auxiliary_loss_mlp": 0.0074858, + "balance_loss_clip": 1.00225961, + "balance_loss_mlp": 1.00045824, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 9.842978337848024, + "language_loss": 0.76040745, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77951455, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.5571019649505615 + }, + { + "auxiliary_loss_clip": 0.01130104, + "auxiliary_loss_mlp": 0.011579, + "balance_loss_clip": 1.0023731, + "balance_loss_mlp": 1.00102103, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.2515881135072533, + "language_loss": 0.81460804, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83748806, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 2.6239967346191406 + }, + { + "auxiliary_loss_clip": 0.01145209, + "auxiliary_loss_mlp": 0.01157597, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00109923, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 1.971449699856877, + "language_loss": 0.77656418, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79959226, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.5777599811553955 + }, + { + "auxiliary_loss_clip": 0.01161464, + "auxiliary_loss_mlp": 0.01157016, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.00109065, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.5395758853454615, + "language_loss": 0.84768987, + "learning_rate": 3.763435021621422e-06, + "loss": 0.8708747, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.542349338531494 + }, + { + "auxiliary_loss_clip": 0.01130618, + "auxiliary_loss_mlp": 0.01157365, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00096273, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 2.06625180788822, + "language_loss": 0.69475853, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71763837, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.6317219734191895 + }, + { + "auxiliary_loss_clip": 0.01147389, + "auxiliary_loss_mlp": 0.01157406, + "balance_loss_clip": 1.00224221, + "balance_loss_mlp": 1.00119424, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.6861714768267135, + "language_loss": 0.73980236, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76285034, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.577911853790283 + }, + { + "auxiliary_loss_clip": 0.01161936, + "auxiliary_loss_mlp": 0.01157372, + "balance_loss_clip": 1.00220788, + "balance_loss_mlp": 1.00096929, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 1.9545478061645603, + "language_loss": 0.88457495, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90776807, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.5059964656829834 + }, + { + "auxiliary_loss_clip": 0.01146169, + "auxiliary_loss_mlp": 0.01157442, + "balance_loss_clip": 1.00219929, + "balance_loss_mlp": 1.00142097, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 1.6324706031052278, + "language_loss": 0.78798693, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.811023, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 2.5779714584350586 + }, + { + "auxiliary_loss_clip": 0.01145854, + "auxiliary_loss_mlp": 0.01158004, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.00112426, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6408243721193285, + "language_loss": 0.75984859, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78288722, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.602810859680176 + }, + { + "auxiliary_loss_clip": 0.01178952, + "auxiliary_loss_mlp": 0.01158134, + "balance_loss_clip": 1.00232482, + "balance_loss_mlp": 1.00125504, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 3.6257993418539605, + "language_loss": 0.85246503, + "learning_rate": 3.762331382119546e-06, + "loss": 0.8758359, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.4782238006591797 + }, + { + "auxiliary_loss_clip": 0.01178925, + "auxiliary_loss_mlp": 0.01157726, + "balance_loss_clip": 1.00241733, + "balance_loss_mlp": 1.00103712, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.781985998248218, + "language_loss": 0.82768244, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.85104889, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.556788921356201 + }, + { + "auxiliary_loss_clip": 0.01131058, + "auxiliary_loss_mlp": 0.01157969, + "balance_loss_clip": 1.00241995, + "balance_loss_mlp": 1.00108933, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.475962310919613, + "language_loss": 0.78368711, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80657738, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.665154218673706 + }, + { + "auxiliary_loss_clip": 0.01163508, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_clip": 1.00237882, + "balance_loss_mlp": 1.0010078, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 2.1883467431519756, + "language_loss": 0.84835821, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87156737, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 2.5309910774230957 + }, + { + "auxiliary_loss_clip": 0.01129897, + "auxiliary_loss_mlp": 0.0074849, + "balance_loss_clip": 1.00199687, + "balance_loss_mlp": 1.00040913, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.9321754726650344, + "language_loss": 0.79754531, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81632918, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.6107184886932373 + }, + { + "auxiliary_loss_clip": 0.0117901, + "auxiliary_loss_mlp": 0.0115802, + "balance_loss_clip": 1.00242281, + "balance_loss_mlp": 1.00114059, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 1.9438034145493697, + "language_loss": 0.81272757, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83609778, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.5677313804626465 + }, + { + "auxiliary_loss_clip": 0.01098376, + "auxiliary_loss_mlp": 0.01149085, + "balance_loss_clip": 1.0026207, + "balance_loss_mlp": 1.00031209, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8819367846594158, + "language_loss": 0.6351316, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65760624, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.2193307876586914 + }, + { + "auxiliary_loss_clip": 0.01130799, + "auxiliary_loss_mlp": 0.01157484, + "balance_loss_clip": 1.00219882, + "balance_loss_mlp": 1.00098586, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.4699943504865947, + "language_loss": 0.79926932, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.82215214, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.5791804790496826 + }, + { + "auxiliary_loss_clip": 0.01145593, + "auxiliary_loss_mlp": 0.01157125, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00119901, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 1.7947887730651895, + "language_loss": 0.84928238, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87230957, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.6208972930908203 + }, + { + "auxiliary_loss_clip": 0.01161913, + "auxiliary_loss_mlp": 0.01156712, + "balance_loss_clip": 1.00229847, + "balance_loss_mlp": 1.00107241, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.6345489681987777, + "language_loss": 0.7993983, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82258451, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.5843963623046875 + }, + { + "auxiliary_loss_clip": 0.01145641, + "auxiliary_loss_mlp": 0.00748546, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.0004189, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.6053142579296877, + "language_loss": 0.79919845, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81814039, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 3.941939353942871 + }, + { + "auxiliary_loss_clip": 0.0114618, + "auxiliary_loss_mlp": 0.01156932, + "balance_loss_clip": 1.00206769, + "balance_loss_mlp": 1.00091064, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.601973904409122, + "language_loss": 0.67587703, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69890809, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.671138286590576 + }, + { + "auxiliary_loss_clip": 0.01147372, + "auxiliary_loss_mlp": 0.01157059, + "balance_loss_clip": 1.00232565, + "balance_loss_mlp": 1.00094295, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.7044526955949528, + "language_loss": 0.73719609, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.76024044, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 2.857985734939575 + }, + { + "auxiliary_loss_clip": 0.01162111, + "auxiliary_loss_mlp": 0.01157114, + "balance_loss_clip": 1.00219738, + "balance_loss_mlp": 1.0009973, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.9540125128275294, + "language_loss": 0.60482717, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62801939, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.633007287979126 + }, + { + "auxiliary_loss_clip": 0.01129968, + "auxiliary_loss_mlp": 0.01157461, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.00115407, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.5993447362427202, + "language_loss": 0.60162455, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62449884, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 2.893779754638672 + }, + { + "auxiliary_loss_clip": 0.01146952, + "auxiliary_loss_mlp": 0.01156814, + "balance_loss_clip": 1.00242794, + "balance_loss_mlp": 1.0009836, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.7912965601899073, + "language_loss": 0.87336004, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89639771, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 6.905814170837402 + }, + { + "auxiliary_loss_clip": 0.01066399, + "auxiliary_loss_mlp": 0.01156871, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00104082, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.76286501471947, + "language_loss": 0.70872653, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73095924, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 2.7688825130462646 + }, + { + "auxiliary_loss_clip": 0.01114809, + "auxiliary_loss_mlp": 0.0115732, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00091743, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 1.8827909120134376, + "language_loss": 0.64168602, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66440731, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 2.7783005237579346 + }, + { + "auxiliary_loss_clip": 0.01178844, + "auxiliary_loss_mlp": 0.01157188, + "balance_loss_clip": 1.0024333, + "balance_loss_mlp": 1.00107181, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 5.735326566379488, + "language_loss": 0.79561055, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81897092, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.497821807861328 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01157044, + "balance_loss_clip": 1.0020411, + "balance_loss_mlp": 1.00092745, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 2.508927419824375, + "language_loss": 0.78987783, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81274366, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.622328042984009 + }, + { + "auxiliary_loss_clip": 0.01162322, + "auxiliary_loss_mlp": 0.0115703, + "balance_loss_clip": 1.00241458, + "balance_loss_mlp": 1.00091398, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5175545954619725, + "language_loss": 0.80967331, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83286679, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.664020538330078 + }, + { + "auxiliary_loss_clip": 0.0116302, + "auxiliary_loss_mlp": 0.0115694, + "balance_loss_clip": 1.00231743, + "balance_loss_mlp": 1.00082326, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 2.1215902660592176, + "language_loss": 0.86698496, + "learning_rate": 3.758449708105424e-06, + "loss": 0.89018452, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.535362482070923 + }, + { + "auxiliary_loss_clip": 0.01163227, + "auxiliary_loss_mlp": 0.01157305, + "balance_loss_clip": 1.00232434, + "balance_loss_mlp": 1.00090265, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 3.2510772673986605, + "language_loss": 0.77412415, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79732943, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.5087757110595703 + }, + { + "auxiliary_loss_clip": 0.01146459, + "auxiliary_loss_mlp": 0.0115651, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00077462, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.0156243989783476, + "language_loss": 0.99442869, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01745832, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.607686996459961 + }, + { + "auxiliary_loss_clip": 0.01147497, + "auxiliary_loss_mlp": 0.01156323, + "balance_loss_clip": 1.00239456, + "balance_loss_mlp": 1.00077891, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.4762172118977235, + "language_loss": 0.86487776, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88791597, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.6585640907287598 + }, + { + "auxiliary_loss_clip": 0.01178778, + "auxiliary_loss_mlp": 0.01157047, + "balance_loss_clip": 1.00239253, + "balance_loss_mlp": 1.00093067, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.9415092171539543, + "language_loss": 0.73454833, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75790656, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.497037172317505 + }, + { + "auxiliary_loss_clip": 0.0117898, + "auxiliary_loss_mlp": 0.01157514, + "balance_loss_clip": 1.00249672, + "balance_loss_mlp": 1.00111139, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.759927938064889, + "language_loss": 0.61912853, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64249343, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.561953544616699 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.01156768, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00093746, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 3.6108498888542058, + "language_loss": 0.78535765, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80804706, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.652510166168213 + }, + { + "auxiliary_loss_clip": 0.01112777, + "auxiliary_loss_mlp": 0.01157044, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00111878, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 2.4269765636423823, + "language_loss": 0.69598383, + "learning_rate": 3.757149278859014e-06, + "loss": 0.71868205, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.691873550415039 + }, + { + "auxiliary_loss_clip": 0.01161945, + "auxiliary_loss_mlp": 0.01157087, + "balance_loss_clip": 1.00215936, + "balance_loss_mlp": 1.00116169, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.8403077643584382, + "language_loss": 0.80254221, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82573259, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.5330893993377686 + }, + { + "auxiliary_loss_clip": 0.011622, + "auxiliary_loss_mlp": 0.01157476, + "balance_loss_clip": 1.00233316, + "balance_loss_mlp": 1.00097847, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.2035322719399124, + "language_loss": 0.82694304, + "learning_rate": 3.756777127858533e-06, + "loss": 0.85013974, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.522061347961426 + }, + { + "auxiliary_loss_clip": 0.01130719, + "auxiliary_loss_mlp": 0.00748662, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00044608, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.1514535940351727, + "language_loss": 0.85727328, + "learning_rate": 3.756590952429017e-06, + "loss": 0.8760671, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.654224395751953 + }, + { + "auxiliary_loss_clip": 0.011787, + "auxiliary_loss_mlp": 0.00748589, + "balance_loss_clip": 1.00232255, + "balance_loss_mlp": 1.00044465, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 6.409978222707167, + "language_loss": 0.72985363, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74912649, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.5815064907073975 + }, + { + "auxiliary_loss_clip": 0.01163314, + "auxiliary_loss_mlp": 0.01156936, + "balance_loss_clip": 1.00238907, + "balance_loss_mlp": 1.00091505, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 2.2665733136317616, + "language_loss": 0.72993875, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75314128, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.58318829536438 + }, + { + "auxiliary_loss_clip": 0.01163058, + "auxiliary_loss_mlp": 0.01157179, + "balance_loss_clip": 1.00232744, + "balance_loss_mlp": 1.00096667, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 2.3178957070314348, + "language_loss": 0.8169266, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84012896, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.55069637298584 + }, + { + "auxiliary_loss_clip": 0.01162348, + "auxiliary_loss_mlp": 0.01157566, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.00116313, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 2.007081043541708, + "language_loss": 0.72581112, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74901021, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.561035633087158 + }, + { + "auxiliary_loss_clip": 0.01162327, + "auxiliary_loss_mlp": 0.01156895, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.0010649, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.8639960476817636, + "language_loss": 0.6571939, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.68038607, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 2.5664467811584473 + }, + { + "auxiliary_loss_clip": 0.01162067, + "auxiliary_loss_mlp": 0.01157262, + "balance_loss_clip": 1.00221598, + "balance_loss_mlp": 1.00124133, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 1.8647560119282753, + "language_loss": 0.68617076, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70936406, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 2.667771577835083 + }, + { + "auxiliary_loss_clip": 0.01145526, + "auxiliary_loss_mlp": 0.01156855, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.00083399, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.815617735062563, + "language_loss": 0.72998935, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75301319, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.66454815864563 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01157133, + "balance_loss_clip": 1.00233591, + "balance_loss_mlp": 1.00101638, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.7747073462732654, + "language_loss": 0.82370359, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84675008, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 2.5728297233581543 + }, + { + "auxiliary_loss_clip": 0.01177516, + "auxiliary_loss_mlp": 0.00747824, + "balance_loss_clip": 1.00304294, + "balance_loss_mlp": 1.00000751, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7928261105429011, + "language_loss": 0.59708428, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61633766, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 2.954568862915039 + }, + { + "auxiliary_loss_clip": 0.01145905, + "auxiliary_loss_mlp": 0.01156628, + "balance_loss_clip": 1.00239694, + "balance_loss_mlp": 1.00089288, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8589185951490284, + "language_loss": 0.76129103, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78431642, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.592130422592163 + }, + { + "auxiliary_loss_clip": 0.011629, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_clip": 1.00239563, + "balance_loss_mlp": 1.0008719, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7302299419437264, + "language_loss": 0.84875786, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87195766, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 2.562044858932495 + }, + { + "auxiliary_loss_clip": 0.01130708, + "auxiliary_loss_mlp": 0.01157494, + "balance_loss_clip": 1.00223243, + "balance_loss_mlp": 1.00099659, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 2.166572329901468, + "language_loss": 0.77507204, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79795408, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.6682188510894775 + }, + { + "auxiliary_loss_clip": 0.01130218, + "auxiliary_loss_mlp": 0.01157425, + "balance_loss_clip": 1.00233579, + "balance_loss_mlp": 1.00073624, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 4.555159220618722, + "language_loss": 0.77614266, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.7990191, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.6203293800354004 + }, + { + "auxiliary_loss_clip": 0.01163134, + "auxiliary_loss_mlp": 0.01157017, + "balance_loss_clip": 1.00226033, + "balance_loss_mlp": 1.00109136, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.90492609575942, + "language_loss": 0.86628664, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.8894881, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.5332090854644775 + }, + { + "auxiliary_loss_clip": 0.01178771, + "auxiliary_loss_mlp": 0.01156874, + "balance_loss_clip": 1.002455, + "balance_loss_mlp": 1.00113881, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.5421724958119025, + "language_loss": 0.91669768, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.94005412, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.5494673252105713 + }, + { + "auxiliary_loss_clip": 0.01132373, + "auxiliary_loss_mlp": 0.01157178, + "balance_loss_clip": 1.00234795, + "balance_loss_mlp": 1.00106108, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.7768244571645806, + "language_loss": 0.65004748, + "learning_rate": 3.75360309139087e-06, + "loss": 0.672943, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.6790385246276855 + }, + { + "auxiliary_loss_clip": 0.01145456, + "auxiliary_loss_mlp": 0.01156984, + "balance_loss_clip": 1.00235093, + "balance_loss_mlp": 1.00124907, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.7583396590659854, + "language_loss": 0.7254709, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74849528, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 4.055167198181152 + }, + { + "auxiliary_loss_clip": 0.01129272, + "auxiliary_loss_mlp": 0.01157134, + "balance_loss_clip": 1.00213718, + "balance_loss_mlp": 1.00111294, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.2068496220057767, + "language_loss": 0.8033582, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.8262223, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.5606706142425537 + }, + { + "auxiliary_loss_clip": 0.01146323, + "auxiliary_loss_mlp": 0.01157253, + "balance_loss_clip": 1.00224531, + "balance_loss_mlp": 1.00123191, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.8397545219937397, + "language_loss": 0.79210532, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81514114, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.599246025085449 + }, + { + "auxiliary_loss_clip": 0.0117868, + "auxiliary_loss_mlp": 0.01156841, + "balance_loss_clip": 1.00240469, + "balance_loss_mlp": 1.00101078, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 2.0081429983769565, + "language_loss": 0.77438223, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773748, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.5380799770355225 + }, + { + "auxiliary_loss_clip": 0.01146554, + "auxiliary_loss_mlp": 0.0115662, + "balance_loss_clip": 1.00216258, + "balance_loss_mlp": 1.00098085, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.8391477499249647, + "language_loss": 0.822276, + "learning_rate": 3.752665892369369e-06, + "loss": 0.84530771, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 4.218570709228516 + }, + { + "auxiliary_loss_clip": 0.01113933, + "auxiliary_loss_mlp": 0.01157223, + "balance_loss_clip": 1.00217152, + "balance_loss_mlp": 1.00101089, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 1.9375023647115346, + "language_loss": 0.73911315, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76182473, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 5.613133907318115 + }, + { + "auxiliary_loss_clip": 0.01145527, + "auxiliary_loss_mlp": 0.01157143, + "balance_loss_clip": 1.00207651, + "balance_loss_mlp": 1.0012176, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.0373334405855417, + "language_loss": 0.71533149, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73835814, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 2.613480567932129 + }, + { + "auxiliary_loss_clip": 0.0112906, + "auxiliary_loss_mlp": 0.01156885, + "balance_loss_clip": 1.00217092, + "balance_loss_mlp": 1.00095916, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.2060497917013033, + "language_loss": 0.69395334, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71681285, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.691066026687622 + }, + { + "auxiliary_loss_clip": 0.01146795, + "auxiliary_loss_mlp": 0.01156542, + "balance_loss_clip": 1.00220239, + "balance_loss_mlp": 1.00109267, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 1.862970740968247, + "language_loss": 0.68559945, + "learning_rate": 3.751914936806767e-06, + "loss": 0.70863283, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.7261979579925537 + }, + { + "auxiliary_loss_clip": 0.0117849, + "auxiliary_loss_mlp": 0.01156011, + "balance_loss_clip": 1.0022434, + "balance_loss_mlp": 1.00084782, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.6056312330083262, + "language_loss": 0.77967072, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80301571, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.5306007862091064 + }, + { + "auxiliary_loss_clip": 0.01178595, + "auxiliary_loss_mlp": 0.01156854, + "balance_loss_clip": 1.00226593, + "balance_loss_mlp": 1.0012145, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 3.27811814925784, + "language_loss": 0.73402917, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75738364, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.52583384513855 + }, + { + "auxiliary_loss_clip": 0.01161927, + "auxiliary_loss_mlp": 0.01156589, + "balance_loss_clip": 1.00222182, + "balance_loss_mlp": 1.00123549, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 2.525373111872108, + "language_loss": 0.70060253, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72378772, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.5382912158966064 + }, + { + "auxiliary_loss_clip": 0.01130691, + "auxiliary_loss_mlp": 0.01156798, + "balance_loss_clip": 1.00219941, + "balance_loss_mlp": 1.00115824, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.118765720867306, + "language_loss": 0.72741604, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.75029099, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.5946474075317383 + }, + { + "auxiliary_loss_clip": 0.01147396, + "auxiliary_loss_mlp": 0.01156525, + "balance_loss_clip": 1.00218427, + "balance_loss_mlp": 1.00098121, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.2979032791481884, + "language_loss": 0.92091352, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94395268, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.5926177501678467 + }, + { + "auxiliary_loss_clip": 0.01115121, + "auxiliary_loss_mlp": 0.01156412, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00096309, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.806457225243934, + "language_loss": 0.58149397, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.6042093, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.710481643676758 + }, + { + "auxiliary_loss_clip": 0.0114707, + "auxiliary_loss_mlp": 0.01156306, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.00085688, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 2.6516633804410317, + "language_loss": 0.8159616, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.83899534, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.581446409225464 + }, + { + "auxiliary_loss_clip": 0.0109938, + "auxiliary_loss_mlp": 0.0115662, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00088489, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 3.5154854590808946, + "language_loss": 0.84030437, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.86286438, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.6639833450317383 + }, + { + "auxiliary_loss_clip": 0.01145692, + "auxiliary_loss_mlp": 0.01156698, + "balance_loss_clip": 1.00213456, + "balance_loss_mlp": 1.00086784, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.2255554833365707, + "language_loss": 0.92748356, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95050746, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.511878728866577 + }, + { + "auxiliary_loss_clip": 0.01147136, + "auxiliary_loss_mlp": 0.01156492, + "balance_loss_clip": 1.00231886, + "balance_loss_mlp": 1.0010432, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 1.6706070708803429, + "language_loss": 0.77666783, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79970419, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.5565574169158936 + }, + { + "auxiliary_loss_clip": 0.01096288, + "auxiliary_loss_mlp": 0.01156471, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00121272, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 2.423678907207247, + "language_loss": 0.69836479, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72089237, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 2.9542531967163086 + }, + { + "auxiliary_loss_clip": 0.01130676, + "auxiliary_loss_mlp": 0.01156961, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.0011301, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 2.6996034502993345, + "language_loss": 0.80929297, + "learning_rate": 3.749655694397135e-06, + "loss": 0.83216935, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.586345911026001 + }, + { + "auxiliary_loss_clip": 0.01162128, + "auxiliary_loss_mlp": 0.01156735, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.00100005, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 1.9017229744772777, + "language_loss": 0.74907362, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77226228, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 2.543325185775757 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.01156868, + "balance_loss_clip": 1.00229716, + "balance_loss_mlp": 1.00103772, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.2466106139047683, + "language_loss": 0.65896404, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68198746, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.545793294906616 + }, + { + "auxiliary_loss_clip": 0.01178806, + "auxiliary_loss_mlp": 0.01157053, + "balance_loss_clip": 1.00236964, + "balance_loss_mlp": 1.00112677, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 6.394582926779984, + "language_loss": 0.69677377, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72013235, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.523763418197632 + }, + { + "auxiliary_loss_clip": 0.01161799, + "auxiliary_loss_mlp": 0.01157018, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.00118721, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4524122591772148, + "language_loss": 0.71474886, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73793697, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.566262722015381 + }, + { + "auxiliary_loss_clip": 0.01147093, + "auxiliary_loss_mlp": 0.01156835, + "balance_loss_clip": 1.00223911, + "balance_loss_mlp": 1.00100422, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 1.8988484076012724, + "language_loss": 0.8002764, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82331574, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.6316890716552734 + }, + { + "auxiliary_loss_clip": 0.01129588, + "auxiliary_loss_mlp": 0.01156508, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00124955, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 1.7264273343973078, + "language_loss": 0.7682265, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79108751, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 2.6332590579986572 + }, + { + "auxiliary_loss_clip": 0.01162857, + "auxiliary_loss_mlp": 0.01156379, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.00092983, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.214025914914381, + "language_loss": 0.76422405, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.7874164, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.541762590408325 + }, + { + "auxiliary_loss_clip": 0.01145371, + "auxiliary_loss_mlp": 0.01156477, + "balance_loss_clip": 1.00210035, + "balance_loss_mlp": 1.00112391, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.8449704957000845, + "language_loss": 0.79169476, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81471324, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.605912923812866 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01156248, + "balance_loss_clip": 1.00229645, + "balance_loss_mlp": 1.00089431, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.960419871335627, + "language_loss": 0.85341555, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87628776, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.6367180347442627 + }, + { + "auxiliary_loss_clip": 0.01135135, + "auxiliary_loss_mlp": 0.0115667, + "balance_loss_clip": 1.00221968, + "balance_loss_mlp": 1.00103009, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.914228064801767, + "language_loss": 0.86698651, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.8899045, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.6453137397766113 + }, + { + "auxiliary_loss_clip": 0.01163476, + "auxiliary_loss_mlp": 0.01156755, + "balance_loss_clip": 1.0024693, + "balance_loss_mlp": 1.00092471, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 2.13728895413937, + "language_loss": 0.78183556, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80503786, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.526132822036743 + }, + { + "auxiliary_loss_clip": 0.01162676, + "auxiliary_loss_mlp": 0.01156662, + "balance_loss_clip": 1.00223517, + "balance_loss_mlp": 1.00121355, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 2.230628194770302, + "language_loss": 0.74350363, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76669705, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.6095128059387207 + }, + { + "auxiliary_loss_clip": 0.01112423, + "auxiliary_loss_mlp": 0.01156529, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00117505, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6395900922999136, + "language_loss": 0.74348783, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76617736, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.6391868591308594 + }, + { + "auxiliary_loss_clip": 0.01161943, + "auxiliary_loss_mlp": 0.0115642, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00087571, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.55994647631276, + "language_loss": 0.84371167, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86689532, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.5642263889312744 + }, + { + "auxiliary_loss_clip": 0.01161917, + "auxiliary_loss_mlp": 0.011564, + "balance_loss_clip": 1.00226402, + "balance_loss_mlp": 1.00085616, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.7461178846364676, + "language_loss": 0.84861016, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.87179327, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.589137554168701 + }, + { + "auxiliary_loss_clip": 0.01145728, + "auxiliary_loss_mlp": 0.01156598, + "balance_loss_clip": 1.00214493, + "balance_loss_mlp": 1.00086296, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.7859305093784354, + "language_loss": 0.76689982, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78992307, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.57979679107666 + }, + { + "auxiliary_loss_clip": 0.01161983, + "auxiliary_loss_mlp": 0.01156441, + "balance_loss_clip": 1.00225401, + "balance_loss_mlp": 1.00118291, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 2.0838554503680653, + "language_loss": 0.64860141, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.67178571, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.5776543617248535 + }, + { + "auxiliary_loss_clip": 0.01163015, + "auxiliary_loss_mlp": 0.01156285, + "balance_loss_clip": 1.00229573, + "balance_loss_mlp": 1.00093162, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 1.981230790508168, + "language_loss": 0.81175667, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83494973, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 3.988312244415283 + }, + { + "auxiliary_loss_clip": 0.01113228, + "auxiliary_loss_mlp": 0.01156475, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00102639, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 1.9246855182067697, + "language_loss": 0.57362139, + "learning_rate": 3.74605902628851e-06, + "loss": 0.59631848, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.697305679321289 + }, + { + "auxiliary_loss_clip": 0.01131486, + "auxiliary_loss_mlp": 0.01157118, + "balance_loss_clip": 1.00225163, + "balance_loss_mlp": 1.00138283, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.848679647896837, + "language_loss": 0.71350145, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73638749, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.627319574356079 + }, + { + "auxiliary_loss_clip": 0.01178448, + "auxiliary_loss_mlp": 0.01155752, + "balance_loss_clip": 1.00228238, + "balance_loss_mlp": 1.00077951, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 2.1782678715312045, + "language_loss": 0.78987992, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81322193, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.4805054664611816 + }, + { + "auxiliary_loss_clip": 0.01146309, + "auxiliary_loss_mlp": 0.011566, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.00134134, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 3.0192239610242684, + "language_loss": 0.84116131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86419034, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 4.153223752975464 + }, + { + "auxiliary_loss_clip": 0.01161956, + "auxiliary_loss_mlp": 0.0115613, + "balance_loss_clip": 1.00230908, + "balance_loss_mlp": 1.00096679, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.75481516497592, + "language_loss": 0.76744622, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.79062712, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 3.928025484085083 + }, + { + "auxiliary_loss_clip": 0.0117858, + "auxiliary_loss_mlp": 0.0115639, + "balance_loss_clip": 1.00226569, + "balance_loss_mlp": 1.00122738, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 2.026078248484608, + "language_loss": 0.82578802, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84913778, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 3.930429458618164 + }, + { + "auxiliary_loss_clip": 0.01146415, + "auxiliary_loss_mlp": 0.01155549, + "balance_loss_clip": 1.0022037, + "balance_loss_mlp": 1.00095868, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.8987687207198003, + "language_loss": 0.85165972, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87467939, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.648480176925659 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01156036, + "balance_loss_clip": 1.00224781, + "balance_loss_mlp": 1.00087357, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 1.7294144719200713, + "language_loss": 0.70203519, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72474593, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 2.7550594806671143 + }, + { + "auxiliary_loss_clip": 0.01178518, + "auxiliary_loss_mlp": 0.01156091, + "balance_loss_clip": 1.00229084, + "balance_loss_mlp": 1.00111938, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.2143833358168976, + "language_loss": 0.70612437, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72947043, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.481419563293457 + }, + { + "auxiliary_loss_clip": 0.01167005, + "auxiliary_loss_mlp": 0.01155877, + "balance_loss_clip": 1.00230956, + "balance_loss_mlp": 1.00119114, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.1142828410639476, + "language_loss": 0.74055582, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76378465, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.5557918548583984 + }, + { + "auxiliary_loss_clip": 0.01178522, + "auxiliary_loss_mlp": 0.01156006, + "balance_loss_clip": 1.00223386, + "balance_loss_mlp": 1.00112903, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.8754482719672698, + "language_loss": 0.80709326, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.83043849, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.6577961444854736 + }, + { + "auxiliary_loss_clip": 0.01115075, + "auxiliary_loss_mlp": 0.01148626, + "balance_loss_clip": 1.00318575, + "balance_loss_mlp": 1.00061536, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9621839145459012, + "language_loss": 0.63599598, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65863299, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.2763826847076416 + }, + { + "auxiliary_loss_clip": 0.01145218, + "auxiliary_loss_mlp": 0.01156101, + "balance_loss_clip": 1.00229645, + "balance_loss_mlp": 1.00093818, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.623708616106759, + "language_loss": 0.81336963, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83638281, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.7167491912841797 + }, + { + "auxiliary_loss_clip": 0.01177902, + "auxiliary_loss_mlp": 0.01148029, + "balance_loss_clip": 1.00325954, + "balance_loss_mlp": 1.00078201, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7656350387786095, + "language_loss": 0.61920965, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64246893, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.161292314529419 + }, + { + "auxiliary_loss_clip": 0.01115851, + "auxiliary_loss_mlp": 0.01156207, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00075841, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 1.8970196355586342, + "language_loss": 0.71583307, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73855364, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.7737245559692383 + }, + { + "auxiliary_loss_clip": 0.01178519, + "auxiliary_loss_mlp": 0.01156219, + "balance_loss_clip": 1.00230873, + "balance_loss_mlp": 1.00086558, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 1.9085932671223018, + "language_loss": 0.85374004, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87708741, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.50628924369812 + }, + { + "auxiliary_loss_clip": 0.01130094, + "auxiliary_loss_mlp": 0.01156086, + "balance_loss_clip": 1.00226176, + "balance_loss_mlp": 1.00092363, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 2.6733937587476806, + "language_loss": 0.76820523, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.791067, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.7242345809936523 + }, + { + "auxiliary_loss_clip": 0.01130421, + "auxiliary_loss_mlp": 0.01155979, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00100732, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.7896385697451038, + "language_loss": 0.81351471, + "learning_rate": 3.74282069289017e-06, + "loss": 0.83637869, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.6954500675201416 + }, + { + "auxiliary_loss_clip": 0.01114335, + "auxiliary_loss_mlp": 0.00748445, + "balance_loss_clip": 1.00218427, + "balance_loss_mlp": 1.00044656, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.114763055892309, + "language_loss": 0.80089074, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81951857, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.718522310256958 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01156091, + "balance_loss_clip": 1.00208068, + "balance_loss_mlp": 1.00130963, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 2.3293857465003485, + "language_loss": 0.83035994, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85322404, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.62469744682312 + }, + { + "auxiliary_loss_clip": 0.01147316, + "auxiliary_loss_mlp": 0.01156075, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.0011034, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 3.864756669486459, + "language_loss": 0.82848912, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85152304, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.6433238983154297 + }, + { + "auxiliary_loss_clip": 0.01161966, + "auxiliary_loss_mlp": 0.01155882, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00100565, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.9448945246067073, + "language_loss": 0.78819704, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81137556, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 2.6358821392059326 + }, + { + "auxiliary_loss_clip": 0.01151617, + "auxiliary_loss_mlp": 0.01156627, + "balance_loss_clip": 1.00258565, + "balance_loss_mlp": 1.00098765, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 2.280254777571571, + "language_loss": 0.8184697, + "learning_rate": 3.741864605462996e-06, + "loss": 0.84155214, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.6043248176574707 + }, + { + "auxiliary_loss_clip": 0.01178704, + "auxiliary_loss_mlp": 0.01156802, + "balance_loss_clip": 1.00246501, + "balance_loss_mlp": 1.00116253, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5414745580443983, + "language_loss": 0.80674994, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83010495, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.528780698776245 + }, + { + "auxiliary_loss_clip": 0.01162157, + "auxiliary_loss_mlp": 0.01156385, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.00112677, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 2.154919180908013, + "language_loss": 0.63669503, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.65988046, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.7069671154022217 + }, + { + "auxiliary_loss_clip": 0.01178467, + "auxiliary_loss_mlp": 0.01156504, + "balance_loss_clip": 1.00235581, + "balance_loss_mlp": 1.00095963, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 1.9110383668876312, + "language_loss": 0.7128967, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73624635, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.5148844718933105 + }, + { + "auxiliary_loss_clip": 0.0117845, + "auxiliary_loss_mlp": 0.0115615, + "balance_loss_clip": 1.00226808, + "balance_loss_mlp": 1.00098681, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 2.370427747388537, + "language_loss": 0.86544192, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.88878798, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.4960007667541504 + }, + { + "auxiliary_loss_clip": 0.01147382, + "auxiliary_loss_mlp": 0.01156338, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.00098491, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.2846106417162084, + "language_loss": 0.77264154, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79567873, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.5634610652923584 + }, + { + "auxiliary_loss_clip": 0.01145067, + "auxiliary_loss_mlp": 0.01156111, + "balance_loss_clip": 1.00224125, + "balance_loss_mlp": 1.00094831, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 2.1932243425614137, + "language_loss": 0.79138207, + "learning_rate": 3.740715120924971e-06, + "loss": 0.81439388, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 2.6505935192108154 + }, + { + "auxiliary_loss_clip": 0.01129701, + "auxiliary_loss_mlp": 0.01156418, + "balance_loss_clip": 1.00205672, + "balance_loss_mlp": 1.00115943, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.254407997205367, + "language_loss": 0.71146971, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73433089, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.6181704998016357 + }, + { + "auxiliary_loss_clip": 0.01146322, + "auxiliary_loss_mlp": 0.01156345, + "balance_loss_clip": 1.00208938, + "balance_loss_mlp": 1.00108671, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.4865646747276307, + "language_loss": 0.73344886, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75647557, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.6367526054382324 + }, + { + "auxiliary_loss_clip": 0.01130473, + "auxiliary_loss_mlp": 0.01155775, + "balance_loss_clip": 1.00194705, + "balance_loss_mlp": 1.00099409, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.2570612681764275, + "language_loss": 0.76076764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78363013, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.595221519470215 + }, + { + "auxiliary_loss_clip": 0.01114574, + "auxiliary_loss_mlp": 0.01156236, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00116837, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 2.036666910032683, + "language_loss": 0.78829455, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.81100261, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.6667017936706543 + }, + { + "auxiliary_loss_clip": 0.01161835, + "auxiliary_loss_mlp": 0.01156216, + "balance_loss_clip": 1.00224531, + "balance_loss_mlp": 1.00105345, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.3098857850434227, + "language_loss": 0.67108417, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69426465, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.5321450233459473 + }, + { + "auxiliary_loss_clip": 0.0113022, + "auxiliary_loss_mlp": 0.01155763, + "balance_loss_clip": 1.00201094, + "balance_loss_mlp": 1.00069523, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.0297351565993997, + "language_loss": 0.76195669, + "learning_rate": 3.739563260095902e-06, + "loss": 0.7848165, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.6325955390930176 + }, + { + "auxiliary_loss_clip": 0.01150206, + "auxiliary_loss_mlp": 0.01155478, + "balance_loss_clip": 1.0024457, + "balance_loss_mlp": 1.0011735, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 1.981724268447854, + "language_loss": 0.80435121, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.82740808, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.5486481189727783 + }, + { + "auxiliary_loss_clip": 0.01163194, + "auxiliary_loss_mlp": 0.01156292, + "balance_loss_clip": 1.0023942, + "balance_loss_mlp": 1.00122404, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.0871647005243976, + "language_loss": 0.84975469, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87294948, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 4.039062976837158 + }, + { + "auxiliary_loss_clip": 0.01128521, + "auxiliary_loss_mlp": 0.01156034, + "balance_loss_clip": 1.00203121, + "balance_loss_mlp": 1.00115705, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.8110363979727777, + "language_loss": 0.74500257, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76784813, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.6872124671936035 + }, + { + "auxiliary_loss_clip": 0.01134847, + "auxiliary_loss_mlp": 0.01156229, + "balance_loss_clip": 1.00240326, + "balance_loss_mlp": 1.00106597, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 2.2109685189966948, + "language_loss": 0.75747693, + "learning_rate": 3.738794033491209e-06, + "loss": 0.7803877, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.661008596420288 + }, + { + "auxiliary_loss_clip": 0.01178379, + "auxiliary_loss_mlp": 0.01155851, + "balance_loss_clip": 1.00230372, + "balance_loss_mlp": 1.00097442, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 2.5205137997758054, + "language_loss": 0.79281747, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81615973, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.4949851036071777 + }, + { + "auxiliary_loss_clip": 0.01130154, + "auxiliary_loss_mlp": 0.01156679, + "balance_loss_clip": 1.00214911, + "balance_loss_mlp": 1.00122976, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 3.758425202917768, + "language_loss": 0.72710454, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74997282, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 4.046480178833008 + }, + { + "auxiliary_loss_clip": 0.01144762, + "auxiliary_loss_mlp": 0.01155435, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00093985, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 3.1026122993242593, + "language_loss": 0.74114549, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76414752, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 2.619109630584717 + }, + { + "auxiliary_loss_clip": 0.01178451, + "auxiliary_loss_mlp": 0.0115581, + "balance_loss_clip": 1.00232983, + "balance_loss_mlp": 1.00112379, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.7698208753619746, + "language_loss": 0.68309933, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.706442, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 5.429798603057861 + }, + { + "auxiliary_loss_clip": 0.01129241, + "auxiliary_loss_mlp": 0.01156363, + "balance_loss_clip": 1.00210261, + "balance_loss_mlp": 1.00119996, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 2.106731772362487, + "language_loss": 0.80033439, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82319045, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.6510047912597656 + }, + { + "auxiliary_loss_clip": 0.01178591, + "auxiliary_loss_mlp": 0.0115661, + "balance_loss_clip": 1.00236452, + "balance_loss_mlp": 1.00106597, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 2.0052822921142623, + "language_loss": 0.71814108, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74149311, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 2.555192232131958 + }, + { + "auxiliary_loss_clip": 0.01161872, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_clip": 1.00229967, + "balance_loss_mlp": 1.00115132, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.9761411477407296, + "language_loss": 0.85139662, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87458038, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 2.5382556915283203 + }, + { + "auxiliary_loss_clip": 0.01150002, + "auxiliary_loss_mlp": 0.01155933, + "balance_loss_clip": 1.00217378, + "balance_loss_mlp": 1.00124717, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 2.064816855428662, + "language_loss": 0.73548508, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75854445, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 2.6496458053588867 + }, + { + "auxiliary_loss_clip": 0.0116267, + "auxiliary_loss_mlp": 0.01155465, + "balance_loss_clip": 1.00229454, + "balance_loss_mlp": 1.00096989, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.524778491047824, + "language_loss": 0.80872452, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83190584, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.691213607788086 + }, + { + "auxiliary_loss_clip": 0.01178407, + "auxiliary_loss_mlp": 0.01155999, + "balance_loss_clip": 1.00236368, + "balance_loss_mlp": 1.00083566, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 2.870849392985733, + "language_loss": 0.74912536, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77246952, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.5065977573394775 + }, + { + "auxiliary_loss_clip": 0.01112637, + "auxiliary_loss_mlp": 0.01155617, + "balance_loss_clip": 1.00212908, + "balance_loss_mlp": 1.00102663, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.4817452565021505, + "language_loss": 0.74252582, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76520836, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.8245928287506104 + }, + { + "auxiliary_loss_clip": 0.0116188, + "auxiliary_loss_mlp": 0.01156117, + "balance_loss_clip": 1.00235069, + "balance_loss_mlp": 1.00085855, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.4969535703544075, + "language_loss": 0.67025268, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69343269, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 2.9317588806152344 + }, + { + "auxiliary_loss_clip": 0.01162597, + "auxiliary_loss_mlp": 0.01156127, + "balance_loss_clip": 1.00219321, + "balance_loss_mlp": 1.00105965, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.8187045552069327, + "language_loss": 0.74103975, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76422703, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.5129270553588867 + }, + { + "auxiliary_loss_clip": 0.01132746, + "auxiliary_loss_mlp": 0.01146016, + "balance_loss_clip": 1.00264478, + "balance_loss_mlp": 1.00029492, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8275858625615635, + "language_loss": 0.50406456, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52685219, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.23750901222229 + }, + { + "auxiliary_loss_clip": 0.01161736, + "auxiliary_loss_mlp": 0.01154958, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00093973, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.8823927399213465, + "language_loss": 0.74598783, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76915479, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.724085807800293 + }, + { + "auxiliary_loss_clip": 0.01114029, + "auxiliary_loss_mlp": 0.01147768, + "balance_loss_clip": 1.00270331, + "balance_loss_mlp": 1.00052094, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8578526366131115, + "language_loss": 0.60114157, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62375957, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.177678108215332 + }, + { + "auxiliary_loss_clip": 0.01096882, + "auxiliary_loss_mlp": 0.01156314, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00134206, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.706727818000284, + "language_loss": 0.78345692, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80598885, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.8550801277160645 + }, + { + "auxiliary_loss_clip": 0.01161565, + "auxiliary_loss_mlp": 0.01155403, + "balance_loss_clip": 1.00218165, + "balance_loss_mlp": 1.00109839, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.901175642138565, + "language_loss": 0.78512752, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80829716, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.552990436553955 + }, + { + "auxiliary_loss_clip": 0.01178392, + "auxiliary_loss_mlp": 0.01156014, + "balance_loss_clip": 1.00230157, + "balance_loss_mlp": 1.00085092, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 1.9057226352234211, + "language_loss": 0.78432596, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80767, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.575747013092041 + }, + { + "auxiliary_loss_clip": 0.01162311, + "auxiliary_loss_mlp": 0.01155751, + "balance_loss_clip": 1.00225711, + "balance_loss_mlp": 1.00125611, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 2.0318867436783457, + "language_loss": 0.80441636, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82759702, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.524492025375366 + }, + { + "auxiliary_loss_clip": 0.01134749, + "auxiliary_loss_mlp": 0.00748519, + "balance_loss_clip": 1.00229836, + "balance_loss_mlp": 1.00048816, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9954479472140267, + "language_loss": 0.7862227, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80505538, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.6998090744018555 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01155378, + "balance_loss_clip": 1.00213778, + "balance_loss_mlp": 1.00078702, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.709192832793725, + "language_loss": 0.81146151, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.83415014, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.643373727798462 + }, + { + "auxiliary_loss_clip": 0.01098351, + "auxiliary_loss_mlp": 0.0115614, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00097728, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.1529934655620084, + "language_loss": 0.85445988, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87700486, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.645789861679077 + }, + { + "auxiliary_loss_clip": 0.01150612, + "auxiliary_loss_mlp": 0.01156437, + "balance_loss_clip": 1.00236726, + "balance_loss_mlp": 1.00127399, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 1.950454477334441, + "language_loss": 0.81310278, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.8361733, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.601130247116089 + }, + { + "auxiliary_loss_clip": 0.01147056, + "auxiliary_loss_mlp": 0.01155563, + "balance_loss_clip": 1.00222647, + "balance_loss_mlp": 1.00078154, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 6.2224359043725315, + "language_loss": 0.75271481, + "learning_rate": 3.73396248424356e-06, + "loss": 0.77574098, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 2.576474905014038 + }, + { + "auxiliary_loss_clip": 0.011614, + "auxiliary_loss_mlp": 0.01155074, + "balance_loss_clip": 1.00210905, + "balance_loss_mlp": 1.00086474, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.622323154387911, + "language_loss": 0.81524372, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83840835, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.5447559356689453 + }, + { + "auxiliary_loss_clip": 0.01161598, + "auxiliary_loss_mlp": 0.01155454, + "balance_loss_clip": 1.00225663, + "balance_loss_mlp": 1.00105405, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.455648272772593, + "language_loss": 0.79939508, + "learning_rate": 3.733574183478691e-06, + "loss": 0.82256556, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.5050580501556396 + }, + { + "auxiliary_loss_clip": 0.01150106, + "auxiliary_loss_mlp": 0.01155123, + "balance_loss_clip": 1.0023526, + "balance_loss_mlp": 1.00091374, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.069814181304471, + "language_loss": 0.79228055, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81533289, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.542964220046997 + }, + { + "auxiliary_loss_clip": 0.01161449, + "auxiliary_loss_mlp": 0.01155432, + "balance_loss_clip": 1.00212681, + "balance_loss_mlp": 1.00103271, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 2.6153869828980065, + "language_loss": 0.74072111, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76388985, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 2.546379804611206 + }, + { + "auxiliary_loss_clip": 0.01129982, + "auxiliary_loss_mlp": 0.01155438, + "balance_loss_clip": 1.002038, + "balance_loss_mlp": 1.00103831, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8582504935494064, + "language_loss": 0.65118837, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67404258, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 2.588076114654541 + }, + { + "auxiliary_loss_clip": 0.01145853, + "auxiliary_loss_mlp": 0.01155882, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00119615, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.556190605011366, + "language_loss": 0.73697352, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75999093, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.6358869075775146 + }, + { + "auxiliary_loss_clip": 0.01146217, + "auxiliary_loss_mlp": 0.01155789, + "balance_loss_clip": 1.00214875, + "balance_loss_mlp": 1.00091231, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 1.6851050364832756, + "language_loss": 0.87921154, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90223157, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.5807158946990967 + }, + { + "auxiliary_loss_clip": 0.01178199, + "auxiliary_loss_mlp": 0.01155369, + "balance_loss_clip": 1.00226808, + "balance_loss_mlp": 1.00096869, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.0879890967943933, + "language_loss": 0.726717, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75005269, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.5204861164093018 + }, + { + "auxiliary_loss_clip": 0.01145235, + "auxiliary_loss_mlp": 0.0115573, + "balance_loss_clip": 1.00213718, + "balance_loss_mlp": 1.00113964, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.063435827260816, + "language_loss": 0.83356977, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85657942, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.6119894981384277 + }, + { + "auxiliary_loss_clip": 0.01161691, + "auxiliary_loss_mlp": 0.01145971, + "balance_loss_clip": 1.00276351, + "balance_loss_mlp": 1.00024986, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8594066116956203, + "language_loss": 0.55807889, + "learning_rate": 3.732018351516544e-06, + "loss": 0.58115554, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.1529951095581055 + }, + { + "auxiliary_loss_clip": 0.01161759, + "auxiliary_loss_mlp": 0.01155494, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00109386, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.6510456918116323, + "language_loss": 0.6998961, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72306865, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 3.9842329025268555 + }, + { + "auxiliary_loss_clip": 0.0114468, + "auxiliary_loss_mlp": 0.01155061, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00085187, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 1.825107187620933, + "language_loss": 0.74294585, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76594329, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.585071086883545 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.0115557, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00117016, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 2.1347335667150724, + "language_loss": 0.84187746, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86473393, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.5906808376312256 + }, + { + "auxiliary_loss_clip": 0.0114467, + "auxiliary_loss_mlp": 0.01154882, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00086331, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7756364828371844, + "language_loss": 0.89983052, + "learning_rate": 3.73123885901997e-06, + "loss": 0.92282599, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.581360340118408 + }, + { + "auxiliary_loss_clip": 0.01129093, + "auxiliary_loss_mlp": 0.01155467, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00097179, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 1.6643452198146422, + "language_loss": 0.74693638, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.76978201, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 4.071459054946899 + }, + { + "auxiliary_loss_clip": 0.01145949, + "auxiliary_loss_mlp": 0.00748628, + "balance_loss_clip": 1.00203311, + "balance_loss_mlp": 1.00062943, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7219608784228804, + "language_loss": 0.7526167, + "learning_rate": 3.730848718849612e-06, + "loss": 0.77156246, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 4.012154817581177 + }, + { + "auxiliary_loss_clip": 0.01160122, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_clip": 1.00289249, + "balance_loss_mlp": 1.00033271, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7815532083768653, + "language_loss": 0.68463898, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70770073, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 4.440356254577637 + }, + { + "auxiliary_loss_clip": 0.0112973, + "auxiliary_loss_mlp": 0.01155356, + "balance_loss_clip": 1.00198591, + "balance_loss_mlp": 1.00086069, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 2.1626321077502095, + "language_loss": 0.73083347, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75368434, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.619882822036743 + }, + { + "auxiliary_loss_clip": 0.01144837, + "auxiliary_loss_mlp": 0.01155234, + "balance_loss_clip": 1.00227642, + "balance_loss_mlp": 1.00102472, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 5.173595085410965, + "language_loss": 0.83786619, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.86086696, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 2.564147710800171 + }, + { + "auxiliary_loss_clip": 0.01097594, + "auxiliary_loss_mlp": 0.01155428, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00093269, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.001316855262021, + "language_loss": 0.801507, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82403719, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.706256866455078 + }, + { + "auxiliary_loss_clip": 0.01144349, + "auxiliary_loss_mlp": 0.01155767, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00098586, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 2.0259254901121846, + "language_loss": 0.78349882, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80649996, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 2.6112778186798096 + }, + { + "auxiliary_loss_clip": 0.01135055, + "auxiliary_loss_mlp": 0.01155373, + "balance_loss_clip": 1.00223267, + "balance_loss_mlp": 1.00116372, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.1916001715144295, + "language_loss": 0.8381148, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.86101907, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.5886008739471436 + }, + { + "auxiliary_loss_clip": 0.01178151, + "auxiliary_loss_mlp": 0.01155664, + "balance_loss_clip": 1.00232577, + "balance_loss_mlp": 1.00135994, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 1.6730235225468895, + "language_loss": 0.79033315, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81367135, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.47123122215271 + }, + { + "auxiliary_loss_clip": 0.01113276, + "auxiliary_loss_mlp": 0.01154895, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00097167, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.080273867866729, + "language_loss": 0.69726753, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71994925, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.647709608078003 + }, + { + "auxiliary_loss_clip": 0.01145348, + "auxiliary_loss_mlp": 0.01154081, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.0007298, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7078327584520416, + "language_loss": 0.91158098, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93457526, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.591992139816284 + }, + { + "auxiliary_loss_clip": 0.01162649, + "auxiliary_loss_mlp": 0.01155651, + "balance_loss_clip": 1.00226331, + "balance_loss_mlp": 1.00096464, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.0972910143164647, + "language_loss": 0.81388497, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83706796, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 2.522353410720825 + }, + { + "auxiliary_loss_clip": 0.01128619, + "auxiliary_loss_mlp": 0.01155018, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.0009042, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 1.9621953031011876, + "language_loss": 0.7587387, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.78157508, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.6362576484680176 + }, + { + "auxiliary_loss_clip": 0.01145115, + "auxiliary_loss_mlp": 0.01155423, + "balance_loss_clip": 1.00210738, + "balance_loss_mlp": 1.00121367, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 25.300663741819633, + "language_loss": 0.83077919, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85378456, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 2.618746757507324 + }, + { + "auxiliary_loss_clip": 0.01149489, + "auxiliary_loss_mlp": 0.01146213, + "balance_loss_clip": 1.00298429, + "balance_loss_mlp": 1.00049114, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8447505391639207, + "language_loss": 0.60530019, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62825716, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 3.003993034362793 + }, + { + "auxiliary_loss_clip": 0.01128758, + "auxiliary_loss_mlp": 0.01154954, + "balance_loss_clip": 1.00204122, + "balance_loss_mlp": 1.00112629, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.275893435595178, + "language_loss": 0.75481713, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77765429, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.5884156227111816 + }, + { + "auxiliary_loss_clip": 0.01161408, + "auxiliary_loss_mlp": 0.00748476, + "balance_loss_clip": 1.00211132, + "balance_loss_mlp": 1.00044656, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 3.9551703252028467, + "language_loss": 0.60815346, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62725228, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.53536319732666 + }, + { + "auxiliary_loss_clip": 0.01178169, + "auxiliary_loss_mlp": 0.01155236, + "balance_loss_clip": 1.0022831, + "balance_loss_mlp": 1.00121748, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 2.3170644716324147, + "language_loss": 0.80744803, + "learning_rate": 3.727718151176243e-06, + "loss": 0.83078212, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.6709976196289062 + }, + { + "auxiliary_loss_clip": 0.01129765, + "auxiliary_loss_mlp": 0.01154816, + "balance_loss_clip": 1.00198686, + "balance_loss_mlp": 1.00098848, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.3231117968900103, + "language_loss": 0.83038414, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85322988, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.5821688175201416 + }, + { + "auxiliary_loss_clip": 0.011772, + "auxiliary_loss_mlp": 0.01146582, + "balance_loss_clip": 1.00288248, + "balance_loss_mlp": 1.00086105, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9664354694553694, + "language_loss": 0.63675886, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65999669, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.9261159896850586 + }, + { + "auxiliary_loss_clip": 0.01145443, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.0009495, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.572448917138908, + "language_loss": 0.76555443, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78855383, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.5709900856018066 + }, + { + "auxiliary_loss_clip": 0.01128953, + "auxiliary_loss_mlp": 0.01154989, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00097084, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.064785314066033, + "language_loss": 0.71390688, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73674631, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.5667481422424316 + }, + { + "auxiliary_loss_clip": 0.0117787, + "auxiliary_loss_mlp": 0.01154708, + "balance_loss_clip": 1.00209868, + "balance_loss_mlp": 1.00097537, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.0372471338989304, + "language_loss": 0.74978304, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77310884, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.4619157314300537 + }, + { + "auxiliary_loss_clip": 0.0112717, + "auxiliary_loss_mlp": 0.01154472, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00102615, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 1.9012022841575607, + "language_loss": 0.883057, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90587342, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.6264808177948 + }, + { + "auxiliary_loss_clip": 0.01177892, + "auxiliary_loss_mlp": 0.01154679, + "balance_loss_clip": 1.00214088, + "balance_loss_mlp": 1.0012331, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.509878392013205, + "language_loss": 0.79886997, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82219565, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.4960105419158936 + }, + { + "auxiliary_loss_clip": 0.01146201, + "auxiliary_loss_mlp": 0.01155002, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.00088859, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.478632983963832, + "language_loss": 0.62488174, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.64789379, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.595193862915039 + }, + { + "auxiliary_loss_clip": 0.01178116, + "auxiliary_loss_mlp": 0.011546, + "balance_loss_clip": 1.0023303, + "balance_loss_mlp": 1.00086808, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.7407283594844156, + "language_loss": 0.80084711, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82417428, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.4902477264404297 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01154413, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00096726, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.3991562885889786, + "language_loss": 0.85707009, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87975097, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 2.6512022018432617 + }, + { + "auxiliary_loss_clip": 0.01177657, + "auxiliary_loss_mlp": 0.01153389, + "balance_loss_clip": 1.00215626, + "balance_loss_mlp": 1.00061023, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.0642972429663713, + "language_loss": 0.84432584, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86763626, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 2.550823926925659 + }, + { + "auxiliary_loss_clip": 0.01161856, + "auxiliary_loss_mlp": 0.011541, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00094008, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.1405646737104256, + "language_loss": 0.86114073, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88430035, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.5563502311706543 + }, + { + "auxiliary_loss_clip": 0.01081101, + "auxiliary_loss_mlp": 0.0115413, + "balance_loss_clip": 1.00167179, + "balance_loss_mlp": 1.00077891, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 2.824048135284701, + "language_loss": 0.78214264, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80449486, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.824894666671753 + }, + { + "auxiliary_loss_clip": 0.01162014, + "auxiliary_loss_mlp": 0.01154372, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00092602, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.9379686978570998, + "language_loss": 0.75638676, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77955061, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.6195831298828125 + }, + { + "auxiliary_loss_clip": 0.01130197, + "auxiliary_loss_mlp": 0.01154393, + "balance_loss_clip": 1.00200677, + "balance_loss_mlp": 1.000947, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.065534977553747, + "language_loss": 0.70978087, + "learning_rate": 3.7247680111229e-06, + "loss": 0.7326268, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.8311915397644043 + }, + { + "auxiliary_loss_clip": 0.01134346, + "auxiliary_loss_mlp": 0.01153953, + "balance_loss_clip": 1.00288332, + "balance_loss_mlp": 1.00069737, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.2333043442515335, + "language_loss": 0.69206625, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71494925, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 4.070679187774658 + }, + { + "auxiliary_loss_clip": 0.01129714, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00094938, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6356757720712471, + "language_loss": 0.76238447, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78522646, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.6245062351226807 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01154146, + "balance_loss_clip": 1.00175846, + "balance_loss_mlp": 1.00089025, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8924599138804243, + "language_loss": 0.69606256, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71889156, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.567380428314209 + }, + { + "auxiliary_loss_clip": 0.01161935, + "auxiliary_loss_mlp": 0.01153925, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00086021, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 2.151430234252215, + "language_loss": 0.74078673, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76394534, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01128628, + "auxiliary_loss_mlp": 0.01153947, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00078726, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 2.0478914161416033, + "language_loss": 0.65906763, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.68189335, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.5975024700164795 + }, + { + "auxiliary_loss_clip": 0.0114429, + "auxiliary_loss_mlp": 0.00748301, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00042033, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 2.263073962713955, + "language_loss": 0.82057458, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83950043, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 4.0027101039886475 + }, + { + "auxiliary_loss_clip": 0.01145704, + "auxiliary_loss_mlp": 0.01154474, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 1.00083756, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.895037025222532, + "language_loss": 0.86955214, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89255393, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 3.970888376235962 + }, + { + "auxiliary_loss_clip": 0.01118794, + "auxiliary_loss_mlp": 0.01154649, + "balance_loss_clip": 1.00279236, + "balance_loss_mlp": 1.00101161, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5506270586579636, + "language_loss": 0.85191566, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87465012, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 4.026398420333862 + }, + { + "auxiliary_loss_clip": 0.01161539, + "auxiliary_loss_mlp": 0.01154633, + "balance_loss_clip": 1.00212288, + "balance_loss_mlp": 1.00090051, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6879033584803145, + "language_loss": 0.89261711, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91577882, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.5682294368743896 + }, + { + "auxiliary_loss_clip": 0.01162575, + "auxiliary_loss_mlp": 0.01154337, + "balance_loss_clip": 1.00211358, + "balance_loss_mlp": 1.00089121, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 5.1446544878066, + "language_loss": 0.78276879, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80593789, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 2.499323606491089 + }, + { + "auxiliary_loss_clip": 0.0114492, + "auxiliary_loss_mlp": 0.0115415, + "balance_loss_clip": 1.00214708, + "balance_loss_mlp": 1.00089431, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.616213851259741, + "language_loss": 0.79164678, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81463748, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 2.5491061210632324 + }, + { + "auxiliary_loss_clip": 0.01177736, + "auxiliary_loss_mlp": 0.01154223, + "balance_loss_clip": 1.00216031, + "balance_loss_mlp": 1.00077629, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.6148675819849796, + "language_loss": 0.75631762, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77963722, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.6036460399627686 + }, + { + "auxiliary_loss_clip": 0.01177944, + "auxiliary_loss_mlp": 0.01154389, + "balance_loss_clip": 1.00224304, + "balance_loss_mlp": 1.00094306, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 1.5075731131186854, + "language_loss": 0.74940389, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77272719, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.5436227321624756 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01153858, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00088882, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.9602678159546867, + "language_loss": 0.73720419, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75987118, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.6578257083892822 + }, + { + "auxiliary_loss_clip": 0.01145375, + "auxiliary_loss_mlp": 0.01153277, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00068891, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.8253782654291208, + "language_loss": 0.73702312, + "learning_rate": 3.721803155320412e-06, + "loss": 0.76000965, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.5776233673095703 + }, + { + "auxiliary_loss_clip": 0.01144148, + "auxiliary_loss_mlp": 0.01154128, + "balance_loss_clip": 1.00207889, + "balance_loss_mlp": 1.00087237, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 3.6311184383615482, + "language_loss": 0.66557759, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68856031, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.59639048576355 + }, + { + "auxiliary_loss_clip": 0.01151247, + "auxiliary_loss_mlp": 0.01154396, + "balance_loss_clip": 1.00269401, + "balance_loss_mlp": 1.00095034, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.3200972462752067, + "language_loss": 0.82791364, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85097015, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 2.581981897354126 + }, + { + "auxiliary_loss_clip": 0.01176996, + "auxiliary_loss_mlp": 0.01145733, + "balance_loss_clip": 1.00290442, + "balance_loss_mlp": 1.00077462, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8290245434789097, + "language_loss": 0.57474315, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59797037, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.104706287384033 + }, + { + "auxiliary_loss_clip": 0.01162166, + "auxiliary_loss_mlp": 0.01154145, + "balance_loss_clip": 1.00211561, + "balance_loss_mlp": 1.00088906, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 1.8776263875614811, + "language_loss": 0.84115261, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.86431575, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.5405702590942383 + }, + { + "auxiliary_loss_clip": 0.01161291, + "auxiliary_loss_mlp": 0.01154558, + "balance_loss_clip": 1.00224602, + "balance_loss_mlp": 1.00120759, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8676827526671915, + "language_loss": 0.77025652, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79341507, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.549006700515747 + }, + { + "auxiliary_loss_clip": 0.01161254, + "auxiliary_loss_mlp": 0.0115435, + "balance_loss_clip": 1.00209582, + "balance_loss_mlp": 1.00099897, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 3.102092428229589, + "language_loss": 0.84132671, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86448276, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.555145502090454 + }, + { + "auxiliary_loss_clip": 0.01161121, + "auxiliary_loss_mlp": 0.00748529, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00055063, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.101678220648997, + "language_loss": 0.76356506, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78266156, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.5198137760162354 + }, + { + "auxiliary_loss_clip": 0.01129006, + "auxiliary_loss_mlp": 0.01154788, + "balance_loss_clip": 1.00216889, + "balance_loss_mlp": 1.00124621, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.5537531676826608, + "language_loss": 0.75584888, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77868676, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.670266628265381 + }, + { + "auxiliary_loss_clip": 0.01177798, + "auxiliary_loss_mlp": 0.01154478, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00084114, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 2.0676100019895265, + "language_loss": 0.78275692, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80607963, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.5006046295166016 + }, + { + "auxiliary_loss_clip": 0.01162024, + "auxiliary_loss_mlp": 0.01154079, + "balance_loss_clip": 1.00221705, + "balance_loss_mlp": 1.00082374, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.8254710205274116, + "language_loss": 0.73219168, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75535274, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 2.539849042892456 + }, + { + "auxiliary_loss_clip": 0.01118683, + "auxiliary_loss_mlp": 0.01154085, + "balance_loss_clip": 1.00271416, + "balance_loss_mlp": 1.00092506, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.967003981677554, + "language_loss": 0.79657197, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81929958, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.6424508094787598 + }, + { + "auxiliary_loss_clip": 0.01177874, + "auxiliary_loss_mlp": 0.01154604, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00106239, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.0067878135886845, + "language_loss": 0.84080434, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86412919, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.4576916694641113 + }, + { + "auxiliary_loss_clip": 0.01162354, + "auxiliary_loss_mlp": 0.01154709, + "balance_loss_clip": 1.00218081, + "balance_loss_mlp": 1.00088143, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 2.161363176045156, + "language_loss": 0.7358948, + "learning_rate": 3.719221729768117e-06, + "loss": 0.75906545, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.61616587638855 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00096774, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 2.382163514035813, + "language_loss": 0.7648108, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.7874943, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 2.6600048542022705 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01145214, + "balance_loss_clip": 1.00302315, + "balance_loss_mlp": 1.00025547, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7599124565082805, + "language_loss": 0.55302882, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57560736, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.1983683109283447 + }, + { + "auxiliary_loss_clip": 0.01144308, + "auxiliary_loss_mlp": 0.01154808, + "balance_loss_clip": 1.00205386, + "balance_loss_mlp": 1.00098026, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.485538176922667, + "language_loss": 0.70728868, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7302798, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.5318779945373535 + }, + { + "auxiliary_loss_clip": 0.01177875, + "auxiliary_loss_mlp": 0.01154199, + "balance_loss_clip": 1.00223231, + "balance_loss_mlp": 1.00094414, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.075767728180654, + "language_loss": 0.80384386, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82716459, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.488572120666504 + }, + { + "auxiliary_loss_clip": 0.01111466, + "auxiliary_loss_mlp": 0.0115455, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00100803, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 2.0166048654898736, + "language_loss": 0.75051445, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77317458, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 2.682715654373169 + }, + { + "auxiliary_loss_clip": 0.01097086, + "auxiliary_loss_mlp": 0.01154764, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.00093579, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 2.677178350134247, + "language_loss": 0.73843229, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.76095074, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.7230844497680664 + }, + { + "auxiliary_loss_clip": 0.01144555, + "auxiliary_loss_mlp": 0.01155103, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.0009892, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.775665150797031, + "language_loss": 0.76748395, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79048049, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 2.5446765422821045 + }, + { + "auxiliary_loss_clip": 0.01162538, + "auxiliary_loss_mlp": 0.01154892, + "balance_loss_clip": 1.00220776, + "balance_loss_mlp": 1.00106466, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 1.9899164445475983, + "language_loss": 0.81911922, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.8422935, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.545189380645752 + }, + { + "auxiliary_loss_clip": 0.01129018, + "auxiliary_loss_mlp": 0.01154998, + "balance_loss_clip": 1.00204396, + "balance_loss_mlp": 1.00107503, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.7974026560721548, + "language_loss": 0.76721364, + "learning_rate": 3.717428133894807e-06, + "loss": 0.79005384, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.670757532119751 + }, + { + "auxiliary_loss_clip": 0.01161557, + "auxiliary_loss_mlp": 0.01154587, + "balance_loss_clip": 1.00230277, + "balance_loss_mlp": 1.00123608, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.5932762804830374, + "language_loss": 0.86379403, + "learning_rate": 3.71722851973837e-06, + "loss": 0.8869555, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.608358860015869 + }, + { + "auxiliary_loss_clip": 0.01144212, + "auxiliary_loss_mlp": 0.01154784, + "balance_loss_clip": 1.0019995, + "balance_loss_mlp": 1.00105155, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5472330059797157, + "language_loss": 0.73546672, + "learning_rate": 3.717028840464455e-06, + "loss": 0.75845665, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 4.127583026885986 + }, + { + "auxiliary_loss_clip": 0.01161284, + "auxiliary_loss_mlp": 0.01154524, + "balance_loss_clip": 1.00232482, + "balance_loss_mlp": 1.00107741, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 1.9138701062575596, + "language_loss": 0.78469628, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80785435, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.515035629272461 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01145527, + "balance_loss_clip": 1.00213969, + "balance_loss_mlp": 1.00056887, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7867169337616012, + "language_loss": 0.5344789, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55703795, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.2528610229492188 + }, + { + "auxiliary_loss_clip": 0.01144882, + "auxiliary_loss_mlp": 0.00748603, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00072026, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9083865945357719, + "language_loss": 0.80556548, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82450032, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.585753917694092 + }, + { + "auxiliary_loss_clip": 0.01145948, + "auxiliary_loss_mlp": 0.0115428, + "balance_loss_clip": 1.00212741, + "balance_loss_mlp": 1.00083339, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 1.9833850030510047, + "language_loss": 0.86523604, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88823831, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 3.96871018409729 + }, + { + "auxiliary_loss_clip": 0.01112616, + "auxiliary_loss_mlp": 0.01154334, + "balance_loss_clip": 1.00202358, + "balance_loss_mlp": 1.00088811, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.6372845278658166, + "language_loss": 0.69067383, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71334338, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 4.0251970291137695 + }, + { + "auxiliary_loss_clip": 0.01130486, + "auxiliary_loss_mlp": 0.0115505, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00131798, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 1.9530427366682783, + "language_loss": 0.80511785, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82797319, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 4.0446083545684814 + }, + { + "auxiliary_loss_clip": 0.01161243, + "auxiliary_loss_mlp": 0.01154265, + "balance_loss_clip": 1.0021534, + "balance_loss_mlp": 1.00100923, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 2.1040906038350697, + "language_loss": 0.83870447, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86185956, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.5194966793060303 + }, + { + "auxiliary_loss_clip": 0.01160648, + "auxiliary_loss_mlp": 0.01154768, + "balance_loss_clip": 1.00211394, + "balance_loss_mlp": 1.00113094, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.574535773471461, + "language_loss": 0.80166399, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82481825, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.546051025390625 + }, + { + "auxiliary_loss_clip": 0.01146642, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.0009234, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.6899869153205522, + "language_loss": 0.81072569, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.83373868, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.5786139965057373 + }, + { + "auxiliary_loss_clip": 0.01161153, + "auxiliary_loss_mlp": 0.01154739, + "balance_loss_clip": 1.00213456, + "balance_loss_mlp": 1.00110185, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 1.6751223523093033, + "language_loss": 0.77487099, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.7980299, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.5625603199005127 + }, + { + "auxiliary_loss_clip": 0.01161407, + "auxiliary_loss_mlp": 0.01155277, + "balance_loss_clip": 1.00229037, + "balance_loss_mlp": 1.00097287, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.6051637302768422, + "language_loss": 0.81648433, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83965123, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.5129194259643555 + }, + { + "auxiliary_loss_clip": 0.01129091, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_clip": 1.0020231, + "balance_loss_mlp": 1.00084555, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.7915148641181933, + "language_loss": 0.80998552, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83281934, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 2.584721803665161 + }, + { + "auxiliary_loss_clip": 0.01161337, + "auxiliary_loss_mlp": 0.01154607, + "balance_loss_clip": 1.00230384, + "balance_loss_mlp": 1.00096977, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.5492932657489056, + "language_loss": 0.89587677, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91903627, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.535781145095825 + }, + { + "auxiliary_loss_clip": 0.01111016, + "auxiliary_loss_mlp": 0.01154553, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00091577, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.392315676542616, + "language_loss": 0.62199664, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64465237, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.664414167404175 + }, + { + "auxiliary_loss_clip": 0.01129036, + "auxiliary_loss_mlp": 0.01154748, + "balance_loss_clip": 1.00218368, + "balance_loss_mlp": 1.00111151, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 2.1822713548138895, + "language_loss": 0.73903131, + "learning_rate": 3.714025842413166e-06, + "loss": 0.76186919, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.6299233436584473 + }, + { + "auxiliary_loss_clip": 0.0116224, + "auxiliary_loss_mlp": 0.01154059, + "balance_loss_clip": 1.00214493, + "balance_loss_mlp": 1.00089931, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.771698096413929, + "language_loss": 0.82654279, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84970582, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.5712132453918457 + }, + { + "auxiliary_loss_clip": 0.01128681, + "auxiliary_loss_mlp": 0.01154711, + "balance_loss_clip": 1.00204217, + "balance_loss_mlp": 1.00107455, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.8395170610057452, + "language_loss": 0.77802563, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80085957, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 2.6182143688201904 + }, + { + "auxiliary_loss_clip": 0.01145914, + "auxiliary_loss_mlp": 0.0115438, + "balance_loss_clip": 1.00237846, + "balance_loss_mlp": 1.00112486, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.571842154626176, + "language_loss": 0.79631066, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81931365, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.58478045463562 + }, + { + "auxiliary_loss_clip": 0.01112816, + "auxiliary_loss_mlp": 0.01154838, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00091457, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 1.9400200229814848, + "language_loss": 0.71872902, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74140555, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.678328037261963 + }, + { + "auxiliary_loss_clip": 0.0116157, + "auxiliary_loss_mlp": 0.01154825, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.00099778, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.9743831395882832, + "language_loss": 0.78744113, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81060505, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.5319461822509766 + }, + { + "auxiliary_loss_clip": 0.01145837, + "auxiliary_loss_mlp": 0.00748568, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.00080848, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 5.3871938933143735, + "language_loss": 0.86140287, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88034689, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.591186761856079 + }, + { + "auxiliary_loss_clip": 0.01134783, + "auxiliary_loss_mlp": 0.0115451, + "balance_loss_clip": 1.0026114, + "balance_loss_mlp": 1.00106359, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.113747855961664, + "language_loss": 0.8891046, + "learning_rate": 3.712619437068174e-06, + "loss": 0.91199756, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.6314125061035156 + }, + { + "auxiliary_loss_clip": 0.01129188, + "auxiliary_loss_mlp": 0.01154857, + "balance_loss_clip": 1.00237298, + "balance_loss_mlp": 1.00112486, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2267203547658423, + "language_loss": 0.78172517, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80456567, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.623683452606201 + }, + { + "auxiliary_loss_clip": 0.01144848, + "auxiliary_loss_mlp": 0.0115516, + "balance_loss_clip": 1.00216806, + "balance_loss_mlp": 1.00095105, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 1.8680590386447082, + "language_loss": 0.81403923, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83703929, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.5595450401306152 + }, + { + "auxiliary_loss_clip": 0.01162337, + "auxiliary_loss_mlp": 0.01154314, + "balance_loss_clip": 1.00237942, + "balance_loss_mlp": 1.00105822, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 2.3013236708333813, + "language_loss": 0.73072064, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75388706, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.52974271774292 + }, + { + "auxiliary_loss_clip": 0.011447, + "auxiliary_loss_mlp": 0.01154395, + "balance_loss_clip": 1.00215614, + "balance_loss_mlp": 1.00085402, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.1695537540506424, + "language_loss": 0.7957083, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81869924, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.6152589321136475 + }, + { + "auxiliary_loss_clip": 0.01143351, + "auxiliary_loss_mlp": 0.0114445, + "balance_loss_clip": 1.00264025, + "balance_loss_mlp": 1.00025463, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9027333881463167, + "language_loss": 0.60406977, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62694776, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.1886649131774902 + }, + { + "auxiliary_loss_clip": 0.01178032, + "auxiliary_loss_mlp": 0.01155304, + "balance_loss_clip": 1.00241411, + "balance_loss_mlp": 1.00128543, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.0524188046597995, + "language_loss": 0.81487429, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83820766, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.540771722793579 + }, + { + "auxiliary_loss_clip": 0.01145627, + "auxiliary_loss_mlp": 0.0074862, + "balance_loss_clip": 1.00231326, + "balance_loss_mlp": 1.00085807, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 1.7404473211894467, + "language_loss": 0.81804013, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83698261, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.6011693477630615 + }, + { + "auxiliary_loss_clip": 0.01146937, + "auxiliary_loss_mlp": 0.01155332, + "balance_loss_clip": 1.00248671, + "balance_loss_mlp": 1.00112247, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 1.6907427642681467, + "language_loss": 0.61177933, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63480198, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.5713181495666504 + }, + { + "auxiliary_loss_clip": 0.01144825, + "auxiliary_loss_mlp": 0.01154551, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00110447, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 1.9250077187419004, + "language_loss": 0.87215698, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89515078, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.5680441856384277 + }, + { + "auxiliary_loss_clip": 0.0113034, + "auxiliary_loss_mlp": 0.01154514, + "balance_loss_clip": 1.00218368, + "balance_loss_mlp": 1.00125861, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 2.1990611964348323, + "language_loss": 0.80518836, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.8280369, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.6123108863830566 + }, + { + "auxiliary_loss_clip": 0.01129048, + "auxiliary_loss_mlp": 0.01154446, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00080884, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 1.7125980769312963, + "language_loss": 0.68367857, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70651352, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 2.6476621627807617 + }, + { + "auxiliary_loss_clip": 0.01177936, + "auxiliary_loss_mlp": 0.01154262, + "balance_loss_clip": 1.00242543, + "balance_loss_mlp": 1.00110173, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.9137349733527225, + "language_loss": 0.81439924, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83772123, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.4944491386413574 + }, + { + "auxiliary_loss_clip": 0.01146086, + "auxiliary_loss_mlp": 0.01154712, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00088429, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.4983381479521545, + "language_loss": 0.85525221, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.8782602, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.6802785396575928 + }, + { + "auxiliary_loss_clip": 0.01128768, + "auxiliary_loss_mlp": 0.01144377, + "balance_loss_clip": 1.00284088, + "balance_loss_mlp": 1.00018179, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7691669503548787, + "language_loss": 0.53300476, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.5557363, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 3.1548542976379395 + }, + { + "auxiliary_loss_clip": 0.01135589, + "auxiliary_loss_mlp": 0.01155037, + "balance_loss_clip": 1.00246596, + "balance_loss_mlp": 1.00130415, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.7515601844833675, + "language_loss": 0.73549253, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75839877, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 4.182244539260864 + }, + { + "auxiliary_loss_clip": 0.01127792, + "auxiliary_loss_mlp": 0.01154312, + "balance_loss_clip": 1.00215375, + "balance_loss_mlp": 1.00077045, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.7577948865091213, + "language_loss": 0.88276529, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90558636, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.6431827545166016 + }, + { + "auxiliary_loss_clip": 0.01128776, + "auxiliary_loss_mlp": 0.01154315, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00086915, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 2.311260695604064, + "language_loss": 0.73930335, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76213431, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.8532605171203613 + }, + { + "auxiliary_loss_clip": 0.0116093, + "auxiliary_loss_mlp": 0.01154664, + "balance_loss_clip": 1.00236034, + "balance_loss_mlp": 1.00112247, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 5.738251517227469, + "language_loss": 0.74856567, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.7717216, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.679084300994873 + }, + { + "auxiliary_loss_clip": 0.0114568, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_clip": 1.00222695, + "balance_loss_mlp": 1.00084186, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.9456224830215199, + "language_loss": 0.86264336, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88563925, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.5837934017181396 + }, + { + "auxiliary_loss_clip": 0.01145783, + "auxiliary_loss_mlp": 0.01154072, + "balance_loss_clip": 1.00217378, + "balance_loss_mlp": 1.00100768, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.5104538803541279, + "language_loss": 0.68392986, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70692837, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 3.989091396331787 + }, + { + "auxiliary_loss_clip": 0.01145876, + "auxiliary_loss_mlp": 0.01154112, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00076139, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4579053257559362, + "language_loss": 0.76167023, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78467011, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 3.989137887954712 + }, + { + "auxiliary_loss_clip": 0.0117804, + "auxiliary_loss_mlp": 0.01154061, + "balance_loss_clip": 1.00252426, + "balance_loss_mlp": 1.00118661, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8488198741002495, + "language_loss": 0.76115912, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78448009, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 3.959096908569336 + }, + { + "auxiliary_loss_clip": 0.01117912, + "auxiliary_loss_mlp": 0.01154429, + "balance_loss_clip": 1.00228679, + "balance_loss_mlp": 1.00079226, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6232190828027195, + "language_loss": 0.75643861, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.77916205, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.6143412590026855 + }, + { + "auxiliary_loss_clip": 0.01161123, + "auxiliary_loss_mlp": 0.01153915, + "balance_loss_clip": 1.00223529, + "balance_loss_mlp": 1.00113606, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.7040875231776584, + "language_loss": 0.8807832, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90393364, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 2.559776544570923 + }, + { + "auxiliary_loss_clip": 0.01177733, + "auxiliary_loss_mlp": 0.01153653, + "balance_loss_clip": 1.00226891, + "balance_loss_mlp": 1.00068426, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.333881027117566, + "language_loss": 0.64117855, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66449237, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.625105381011963 + }, + { + "auxiliary_loss_clip": 0.01129708, + "auxiliary_loss_mlp": 0.01153995, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00092983, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.270872879146488, + "language_loss": 0.7438066, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76664364, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.644723415374756 + }, + { + "auxiliary_loss_clip": 0.01160703, + "auxiliary_loss_mlp": 0.01154559, + "balance_loss_clip": 1.002231, + "balance_loss_mlp": 1.00092161, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.0869187802513736, + "language_loss": 0.83769369, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.86084628, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.537116289138794 + }, + { + "auxiliary_loss_clip": 0.01161048, + "auxiliary_loss_mlp": 0.01154207, + "balance_loss_clip": 1.0022136, + "balance_loss_mlp": 1.00104666, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 2.01617342159109, + "language_loss": 0.80802274, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83117527, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.6002719402313232 + }, + { + "auxiliary_loss_clip": 0.01146731, + "auxiliary_loss_mlp": 0.01153531, + "balance_loss_clip": 1.0021292, + "balance_loss_mlp": 1.00094295, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.4815115010418967, + "language_loss": 0.8747673, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89776999, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 2.6800553798675537 + }, + { + "auxiliary_loss_clip": 0.01129887, + "auxiliary_loss_mlp": 0.00748686, + "balance_loss_clip": 1.00220406, + "balance_loss_mlp": 1.00089359, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.544631364175244, + "language_loss": 0.71246147, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73124719, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.7153091430664062 + }, + { + "auxiliary_loss_clip": 0.01095131, + "auxiliary_loss_mlp": 0.01144318, + "balance_loss_clip": 1.00265241, + "balance_loss_mlp": 1.00012231, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8279328622311282, + "language_loss": 0.66338927, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68578368, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.481569766998291 + }, + { + "auxiliary_loss_clip": 0.01161052, + "auxiliary_loss_mlp": 0.01154524, + "balance_loss_clip": 1.00221622, + "balance_loss_mlp": 1.00107789, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.047436978345858, + "language_loss": 0.74216807, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76532376, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.866750717163086 + }, + { + "auxiliary_loss_clip": 0.01129267, + "auxiliary_loss_mlp": 0.01153721, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.00094247, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.8562374740139302, + "language_loss": 0.79236543, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81519532, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.7883734703063965 + }, + { + "auxiliary_loss_clip": 0.01145066, + "auxiliary_loss_mlp": 0.01154228, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00078154, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 1.954532131463045, + "language_loss": 0.76131171, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.78430462, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 2.8831520080566406 + }, + { + "auxiliary_loss_clip": 0.01129185, + "auxiliary_loss_mlp": 0.01153884, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00091445, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.021204632052611, + "language_loss": 0.80248392, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82531452, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.657531261444092 + }, + { + "auxiliary_loss_clip": 0.01129886, + "auxiliary_loss_mlp": 0.01144255, + "balance_loss_clip": 1.0029695, + "balance_loss_mlp": 1.00005913, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.8633324156407781, + "language_loss": 0.6517266, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67446804, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 3.0014865398406982 + }, + { + "auxiliary_loss_clip": 0.01142632, + "auxiliary_loss_mlp": 0.01144765, + "balance_loss_clip": 1.00198758, + "balance_loss_mlp": 1.00056922, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7871780772148921, + "language_loss": 0.56951821, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59239221, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.280438184738159 + }, + { + "auxiliary_loss_clip": 0.01145755, + "auxiliary_loss_mlp": 0.00748573, + "balance_loss_clip": 1.00232339, + "balance_loss_mlp": 1.00073838, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.7780480718490532, + "language_loss": 0.80976474, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82870805, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.6269307136535645 + }, + { + "auxiliary_loss_clip": 0.01162287, + "auxiliary_loss_mlp": 0.01154359, + "balance_loss_clip": 1.00236976, + "balance_loss_mlp": 1.00091302, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.6026396683319724, + "language_loss": 0.53911096, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56227744, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.612687110900879 + }, + { + "auxiliary_loss_clip": 0.01144298, + "auxiliary_loss_mlp": 0.01154163, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.0009079, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0337028163734887, + "language_loss": 0.85591584, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.87890047, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.594980001449585 + }, + { + "auxiliary_loss_clip": 0.01177904, + "auxiliary_loss_mlp": 0.01154128, + "balance_loss_clip": 1.00245786, + "balance_loss_mlp": 1.00106287, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.850338749042816, + "language_loss": 0.72070551, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74402583, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.5520031452178955 + }, + { + "auxiliary_loss_clip": 0.01144616, + "auxiliary_loss_mlp": 0.01154379, + "balance_loss_clip": 1.0021615, + "balance_loss_mlp": 1.00112319, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.7532835293333968, + "language_loss": 0.76961756, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.79260749, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.60019850730896 + }, + { + "auxiliary_loss_clip": 0.01146663, + "auxiliary_loss_mlp": 0.01153178, + "balance_loss_clip": 1.00225103, + "balance_loss_mlp": 1.00097156, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 1.6506354882189072, + "language_loss": 0.69224638, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71524477, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.678812026977539 + }, + { + "auxiliary_loss_clip": 0.0113525, + "auxiliary_loss_mlp": 0.01154436, + "balance_loss_clip": 1.00225592, + "balance_loss_mlp": 1.00098944, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.8909889388358847, + "language_loss": 0.81321692, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83611381, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.6844642162323 + }, + { + "auxiliary_loss_clip": 0.01161339, + "auxiliary_loss_mlp": 0.0115391, + "balance_loss_clip": 1.00216019, + "balance_loss_mlp": 1.00075042, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.683931992179469, + "language_loss": 0.76961881, + "learning_rate": 3.703502390349417e-06, + "loss": 0.79277128, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.625403642654419 + }, + { + "auxiliary_loss_clip": 0.01114395, + "auxiliary_loss_mlp": 0.01154431, + "balance_loss_clip": 1.00206316, + "balance_loss_mlp": 1.00098491, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.7715561851523411, + "language_loss": 0.79294544, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81563365, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.682678699493408 + }, + { + "auxiliary_loss_clip": 0.01159899, + "auxiliary_loss_mlp": 0.01144336, + "balance_loss_clip": 1.00267267, + "balance_loss_mlp": 1.00014007, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9270994157892861, + "language_loss": 0.61990809, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64295048, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.0492265224456787 + }, + { + "auxiliary_loss_clip": 0.01128629, + "auxiliary_loss_mlp": 0.0074877, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00093317, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1845519254270096, + "language_loss": 0.80793536, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.82670927, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 2.6721410751342773 + }, + { + "auxiliary_loss_clip": 0.01095083, + "auxiliary_loss_mlp": 0.01154705, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.00116301, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 1.6975076716084074, + "language_loss": 0.74659538, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76909328, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 2.8662002086639404 + }, + { + "auxiliary_loss_clip": 0.01161941, + "auxiliary_loss_mlp": 0.01155205, + "balance_loss_clip": 1.00232315, + "balance_loss_mlp": 1.00166357, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6367399826400972, + "language_loss": 0.80213445, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82530594, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.581954002380371 + }, + { + "auxiliary_loss_clip": 0.01129401, + "auxiliary_loss_mlp": 0.01155002, + "balance_loss_clip": 1.00220644, + "balance_loss_mlp": 1.00107884, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 2.008355059075764, + "language_loss": 0.77897573, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.8018198, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.641608238220215 + }, + { + "auxiliary_loss_clip": 0.01177917, + "auxiliary_loss_mlp": 0.01154643, + "balance_loss_clip": 1.00237846, + "balance_loss_mlp": 1.00110173, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.2998647004482433, + "language_loss": 0.68953753, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71286309, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 4.079542398452759 + }, + { + "auxiliary_loss_clip": 0.01134518, + "auxiliary_loss_mlp": 0.01154699, + "balance_loss_clip": 1.00245571, + "balance_loss_mlp": 1.00144362, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 1.8484972779209174, + "language_loss": 0.69219798, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71509016, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 2.705237865447998 + }, + { + "auxiliary_loss_clip": 0.0111397, + "auxiliary_loss_mlp": 0.01154341, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.0008949, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.3448267083568455, + "language_loss": 0.66827226, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.69095528, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.8075757026672363 + }, + { + "auxiliary_loss_clip": 0.0116129, + "auxiliary_loss_mlp": 0.0115428, + "balance_loss_clip": 1.00232148, + "balance_loss_mlp": 1.00092876, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 3.009884143903046, + "language_loss": 0.74249899, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76565468, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.5935192108154297 + }, + { + "auxiliary_loss_clip": 0.01129665, + "auxiliary_loss_mlp": 0.0115403, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00115609, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 1.7776583486455981, + "language_loss": 0.71459949, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73743641, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.632988691329956 + }, + { + "auxiliary_loss_clip": 0.01118209, + "auxiliary_loss_mlp": 0.01154707, + "balance_loss_clip": 1.00200903, + "balance_loss_mlp": 1.00126052, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 2.2122778017303104, + "language_loss": 0.7261799, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74890906, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 4.0132505893707275 + }, + { + "auxiliary_loss_clip": 0.01130085, + "auxiliary_loss_mlp": 0.01154767, + "balance_loss_clip": 1.00216901, + "balance_loss_mlp": 1.00122571, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.923047088427291, + "language_loss": 0.80902135, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83186984, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.7610363960266113 + }, + { + "auxiliary_loss_clip": 0.01177884, + "auxiliary_loss_mlp": 0.01154392, + "balance_loss_clip": 1.00240302, + "balance_loss_mlp": 1.00104117, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.764582924221514, + "language_loss": 0.83971715, + "learning_rate": 3.700639264372948e-06, + "loss": 0.86303991, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 5.378782510757446 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01153534, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00104141, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.668660430390074, + "language_loss": 0.68139297, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70406193, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.655486583709717 + }, + { + "auxiliary_loss_clip": 0.01133936, + "auxiliary_loss_mlp": 0.01154273, + "balance_loss_clip": 1.00218749, + "balance_loss_mlp": 1.00130367, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.243340209382307, + "language_loss": 0.73471099, + "learning_rate": 3.70022921406487e-06, + "loss": 0.7575931, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.617711067199707 + }, + { + "auxiliary_loss_clip": 0.01160582, + "auxiliary_loss_mlp": 0.01154849, + "balance_loss_clip": 1.00217509, + "balance_loss_mlp": 1.00130737, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.6156059412997767, + "language_loss": 0.86819065, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89134502, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.56329083442688 + }, + { + "auxiliary_loss_clip": 0.01111873, + "auxiliary_loss_mlp": 0.01153516, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00083292, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.6324650604872166, + "language_loss": 0.70954263, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73219657, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.762188196182251 + }, + { + "auxiliary_loss_clip": 0.01128725, + "auxiliary_loss_mlp": 0.01154023, + "balance_loss_clip": 1.00209343, + "balance_loss_mlp": 1.00114942, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.6104294587243255, + "language_loss": 0.7125845, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73541194, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.618699550628662 + }, + { + "auxiliary_loss_clip": 0.01146108, + "auxiliary_loss_mlp": 0.0115414, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.00088489, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 2.1856592660239778, + "language_loss": 0.75906396, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78206646, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.5998306274414062 + }, + { + "auxiliary_loss_clip": 0.01145665, + "auxiliary_loss_mlp": 0.0115461, + "balance_loss_clip": 1.00211608, + "balance_loss_mlp": 1.00106823, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.722222142123101, + "language_loss": 0.80598807, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82899094, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.599865436553955 + }, + { + "auxiliary_loss_clip": 0.01162335, + "auxiliary_loss_mlp": 0.01154578, + "balance_loss_clip": 1.00236154, + "balance_loss_mlp": 1.00113153, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.8195844532633594, + "language_loss": 0.80836481, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.83153391, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 2.577286720275879 + }, + { + "auxiliary_loss_clip": 0.01144215, + "auxiliary_loss_mlp": 0.01154138, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.00097764, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 1.9890263619458146, + "language_loss": 0.90079319, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92377663, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 2.551669120788574 + }, + { + "auxiliary_loss_clip": 0.01143222, + "auxiliary_loss_mlp": 0.00748023, + "balance_loss_clip": 1.00262952, + "balance_loss_mlp": 1.00070941, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8636424004966211, + "language_loss": 0.55887073, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57778317, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.117891788482666 + }, + { + "auxiliary_loss_clip": 0.01145501, + "auxiliary_loss_mlp": 0.0074873, + "balance_loss_clip": 1.00217819, + "balance_loss_mlp": 1.00099492, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 2.1033769259797968, + "language_loss": 0.84427977, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86322218, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.583892822265625 + }, + { + "auxiliary_loss_clip": 0.01146885, + "auxiliary_loss_mlp": 0.01155363, + "balance_loss_clip": 1.00240695, + "balance_loss_mlp": 1.0010581, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.866558639027684, + "language_loss": 0.69454932, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71757174, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.5695388317108154 + }, + { + "auxiliary_loss_clip": 0.01149508, + "auxiliary_loss_mlp": 0.01153865, + "balance_loss_clip": 1.00219262, + "balance_loss_mlp": 1.00080061, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.709710807778429, + "language_loss": 0.72048855, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.74352229, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.5609755516052246 + }, + { + "auxiliary_loss_clip": 0.01162027, + "auxiliary_loss_mlp": 0.01154331, + "balance_loss_clip": 1.00212049, + "balance_loss_mlp": 1.00136137, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 1.722053550111269, + "language_loss": 0.82970554, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85286909, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.5297293663024902 + }, + { + "auxiliary_loss_clip": 0.01161114, + "auxiliary_loss_mlp": 0.01143424, + "balance_loss_clip": 1.0027895, + "balance_loss_mlp": 0.99999136, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7919043755824701, + "language_loss": 0.59001136, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61305678, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.069361686706543 + }, + { + "auxiliary_loss_clip": 0.01099569, + "auxiliary_loss_mlp": 0.01154509, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00115776, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.9458930396007914, + "language_loss": 0.6299935, + "learning_rate": 3.697351644435763e-06, + "loss": 0.65253431, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.706310510635376 + }, + { + "auxiliary_loss_clip": 0.01145748, + "auxiliary_loss_mlp": 0.01155005, + "balance_loss_clip": 1.00234532, + "balance_loss_mlp": 1.00127256, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 1.9416381628629917, + "language_loss": 0.75352776, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77653533, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.6023616790771484 + }, + { + "auxiliary_loss_clip": 0.01162342, + "auxiliary_loss_mlp": 0.00748803, + "balance_loss_clip": 1.00235343, + "balance_loss_mlp": 1.00106907, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.5442952121823006, + "language_loss": 0.76644343, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78555489, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.5600340366363525 + }, + { + "auxiliary_loss_clip": 0.01160802, + "auxiliary_loss_mlp": 0.01154666, + "balance_loss_clip": 1.0021621, + "balance_loss_mlp": 1.00122023, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.8874398341114675, + "language_loss": 0.75194305, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77509773, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.6362802982330322 + }, + { + "auxiliary_loss_clip": 0.01112373, + "auxiliary_loss_mlp": 0.01154891, + "balance_loss_clip": 1.0020169, + "balance_loss_mlp": 1.00106311, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.5891973663327037, + "language_loss": 0.71511745, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73779011, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.7594504356384277 + }, + { + "auxiliary_loss_clip": 0.01128961, + "auxiliary_loss_mlp": 0.01154459, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00101316, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 2.631228896973514, + "language_loss": 0.85522425, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87805843, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.609314203262329 + }, + { + "auxiliary_loss_clip": 0.01128288, + "auxiliary_loss_mlp": 0.01154091, + "balance_loss_clip": 1.00192785, + "balance_loss_mlp": 1.00093126, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 2.319670696917516, + "language_loss": 0.69730848, + "learning_rate": 3.696114537236335e-06, + "loss": 0.72013229, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.724400758743286 + }, + { + "auxiliary_loss_clip": 0.0116222, + "auxiliary_loss_mlp": 0.01154418, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00097167, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 1.8366118788288956, + "language_loss": 0.68652058, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70968693, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.654323101043701 + }, + { + "auxiliary_loss_clip": 0.01128346, + "auxiliary_loss_mlp": 0.01154008, + "balance_loss_clip": 1.00218117, + "balance_loss_mlp": 1.00103855, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 2.2727048329997728, + "language_loss": 0.77677453, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.7995981, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.621748685836792 + }, + { + "auxiliary_loss_clip": 0.01150741, + "auxiliary_loss_mlp": 0.01155038, + "balance_loss_clip": 1.00227499, + "balance_loss_mlp": 1.00140083, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.7793471060674904, + "language_loss": 0.65287125, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67592901, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.5384511947631836 + }, + { + "auxiliary_loss_clip": 0.01159205, + "auxiliary_loss_mlp": 0.01143503, + "balance_loss_clip": 1.00270391, + "balance_loss_mlp": 1.00006998, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6744681794888362, + "language_loss": 0.58113134, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.6041584, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.1820578575134277 + }, + { + "auxiliary_loss_clip": 0.01132843, + "auxiliary_loss_mlp": 0.0115365, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.0008713, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.7306149920969554, + "language_loss": 0.91904497, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94190991, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 2.674269676208496 + }, + { + "auxiliary_loss_clip": 0.01162028, + "auxiliary_loss_mlp": 0.01154598, + "balance_loss_clip": 1.00222111, + "balance_loss_mlp": 1.00115216, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.9100237662072754, + "language_loss": 0.78800452, + "learning_rate": 3.694875114631167e-06, + "loss": 0.81117082, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.5660815238952637 + }, + { + "auxiliary_loss_clip": 0.01113131, + "auxiliary_loss_mlp": 0.01153634, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00104666, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.7444459184829346, + "language_loss": 0.71393913, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73660684, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.7647430896759033 + }, + { + "auxiliary_loss_clip": 0.01149956, + "auxiliary_loss_mlp": 0.01143559, + "balance_loss_clip": 1.00291049, + "balance_loss_mlp": 1.00012636, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9676026472547185, + "language_loss": 0.62503719, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64797235, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.140347480773926 + }, + { + "auxiliary_loss_clip": 0.01177469, + "auxiliary_loss_mlp": 0.01154262, + "balance_loss_clip": 1.00219786, + "balance_loss_mlp": 1.0013876, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.5585066219378816, + "language_loss": 0.82484782, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84816515, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 3.8826379776000977 + }, + { + "auxiliary_loss_clip": 0.01162156, + "auxiliary_loss_mlp": 0.0115405, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00079465, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.934457876942301, + "language_loss": 0.81427896, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83744109, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.5750582218170166 + }, + { + "auxiliary_loss_clip": 0.01144166, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00109136, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.8306413814209443, + "language_loss": 0.76704633, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79002768, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.5682523250579834 + }, + { + "auxiliary_loss_clip": 0.01101122, + "auxiliary_loss_mlp": 0.01154251, + "balance_loss_clip": 1.0021162, + "balance_loss_mlp": 1.00090003, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9649851867255699, + "language_loss": 0.79874909, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.82130289, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.728498935699463 + }, + { + "auxiliary_loss_clip": 0.01160229, + "auxiliary_loss_mlp": 0.011535, + "balance_loss_clip": 1.0020287, + "balance_loss_mlp": 1.00091231, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 2.131118334517923, + "language_loss": 0.86484003, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.8879773, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 3.939443588256836 + }, + { + "auxiliary_loss_clip": 0.01177777, + "auxiliary_loss_mlp": 0.01153725, + "balance_loss_clip": 1.00251794, + "balance_loss_mlp": 1.00094676, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 2.6916557286451157, + "language_loss": 0.74888384, + "learning_rate": 3.693218952340186e-06, + "loss": 0.7721988, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.57279896736145 + }, + { + "auxiliary_loss_clip": 0.01146901, + "auxiliary_loss_mlp": 0.011542, + "balance_loss_clip": 1.00221145, + "balance_loss_mlp": 1.00123048, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 2.0054960242517215, + "language_loss": 0.79347849, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81648952, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 4.078248500823975 + }, + { + "auxiliary_loss_clip": 0.01128542, + "auxiliary_loss_mlp": 0.00748933, + "balance_loss_clip": 1.00211763, + "balance_loss_mlp": 1.00134158, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 2.7189909004389197, + "language_loss": 0.80326784, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82204258, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 4.00873589515686 + }, + { + "auxiliary_loss_clip": 0.0113003, + "auxiliary_loss_mlp": 0.01153577, + "balance_loss_clip": 1.00222778, + "balance_loss_mlp": 1.0007031, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 1.9884041033818904, + "language_loss": 0.74623668, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76907271, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.6051571369171143 + }, + { + "auxiliary_loss_clip": 0.01160925, + "auxiliary_loss_mlp": 0.01154261, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.00091028, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.5900166946602683, + "language_loss": 0.76502669, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78817856, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.537691354751587 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01154357, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00129199, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.6747030541755794, + "language_loss": 0.68430734, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70697218, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.6974716186523438 + }, + { + "auxiliary_loss_clip": 0.01112132, + "auxiliary_loss_mlp": 0.0115404, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00126159, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.4012555485374927, + "language_loss": 0.81249017, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83515191, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.7057061195373535 + }, + { + "auxiliary_loss_clip": 0.01145194, + "auxiliary_loss_mlp": 0.01153495, + "balance_loss_clip": 1.0021621, + "balance_loss_mlp": 1.00081182, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.5257383574114574, + "language_loss": 0.7966997, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.81968659, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.5458991527557373 + }, + { + "auxiliary_loss_clip": 0.0117756, + "auxiliary_loss_mlp": 0.0115378, + "balance_loss_clip": 1.002316, + "balance_loss_mlp": 1.00090647, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.991115383024545, + "language_loss": 0.71861917, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74193251, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.470522880554199 + }, + { + "auxiliary_loss_clip": 0.01160841, + "auxiliary_loss_mlp": 0.01153343, + "balance_loss_clip": 1.00226676, + "balance_loss_mlp": 1.00085044, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 2.0198104502430962, + "language_loss": 0.87146473, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89460659, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.5058746337890625 + }, + { + "auxiliary_loss_clip": 0.01144997, + "auxiliary_loss_mlp": 0.01153377, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00088477, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 2.2689233821655983, + "language_loss": 0.71200883, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73499256, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 2.60583233833313 + }, + { + "auxiliary_loss_clip": 0.0112795, + "auxiliary_loss_mlp": 0.01153557, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00096953, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.193142192306095, + "language_loss": 0.86407381, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88688892, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.5663115978240967 + }, + { + "auxiliary_loss_clip": 0.01162071, + "auxiliary_loss_mlp": 0.01154182, + "balance_loss_clip": 1.00234938, + "balance_loss_mlp": 1.00121307, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.441655138520036, + "language_loss": 0.80707753, + "learning_rate": 3.69072700532013e-06, + "loss": 0.83024007, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.5738158226013184 + }, + { + "auxiliary_loss_clip": 0.01146318, + "auxiliary_loss_mlp": 0.01153221, + "balance_loss_clip": 1.00227451, + "balance_loss_mlp": 1.00082374, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.8670143745521297, + "language_loss": 0.858904, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88189942, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.604776382446289 + }, + { + "auxiliary_loss_clip": 0.01160979, + "auxiliary_loss_mlp": 0.01153224, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00101757, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.868741270027683, + "language_loss": 0.83717501, + "learning_rate": 3.69031078287345e-06, + "loss": 0.86031705, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.5766103267669678 + }, + { + "auxiliary_loss_clip": 0.01160753, + "auxiliary_loss_mlp": 0.0115334, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.00065708, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 1.818278722988165, + "language_loss": 0.83757758, + "learning_rate": 3.690102575501033e-06, + "loss": 0.86071849, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.5520079135894775 + }, + { + "auxiliary_loss_clip": 0.01129822, + "auxiliary_loss_mlp": 0.01152939, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.0008285, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9129597900006388, + "language_loss": 0.77311242, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79594004, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.668466567993164 + }, + { + "auxiliary_loss_clip": 0.01143889, + "auxiliary_loss_mlp": 0.0115365, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00087118, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 3.0306520580628504, + "language_loss": 0.87589449, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89886987, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.5662529468536377 + }, + { + "auxiliary_loss_clip": 0.0114576, + "auxiliary_loss_mlp": 0.01153408, + "balance_loss_clip": 1.00234556, + "balance_loss_mlp": 1.00091588, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 1.911924260274822, + "language_loss": 0.78136909, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80436081, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.5495142936706543 + }, + { + "auxiliary_loss_clip": 0.01160775, + "auxiliary_loss_mlp": 0.0115315, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00084829, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 2.184492643718978, + "language_loss": 0.76733005, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.79046935, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.5331568717956543 + }, + { + "auxiliary_loss_clip": 0.0112753, + "auxiliary_loss_mlp": 0.00748668, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00115383, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6028424418650027, + "language_loss": 0.79581916, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81458116, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.656050443649292 + }, + { + "auxiliary_loss_clip": 0.01144811, + "auxiliary_loss_mlp": 0.01153148, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00094175, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.8644898394082767, + "language_loss": 0.69774562, + "learning_rate": 3.688851985676991e-06, + "loss": 0.72072524, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.7132182121276855 + }, + { + "auxiliary_loss_clip": 0.01129597, + "auxiliary_loss_mlp": 0.01152997, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00088644, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.6555187569612695, + "language_loss": 0.80751657, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83034259, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.6063756942749023 + }, + { + "auxiliary_loss_clip": 0.01162082, + "auxiliary_loss_mlp": 0.01153132, + "balance_loss_clip": 1.00234544, + "balance_loss_mlp": 1.00092578, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.8988503807635604, + "language_loss": 0.83250809, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.8556602, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.544264316558838 + }, + { + "auxiliary_loss_clip": 0.01161808, + "auxiliary_loss_mlp": 0.01153435, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00113308, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.7016784667417333, + "language_loss": 0.8578229, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88097537, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.5337188243865967 + }, + { + "auxiliary_loss_clip": 0.01128049, + "auxiliary_loss_mlp": 0.01152793, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00077736, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 1.9769724949357723, + "language_loss": 0.84972227, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.87253064, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.616457939147949 + }, + { + "auxiliary_loss_clip": 0.01177412, + "auxiliary_loss_mlp": 0.01152547, + "balance_loss_clip": 1.00229728, + "balance_loss_mlp": 1.00081706, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 1.9292823200385536, + "language_loss": 0.67651755, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.69981718, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.547356605529785 + }, + { + "auxiliary_loss_clip": 0.01177238, + "auxiliary_loss_mlp": 0.01153026, + "balance_loss_clip": 1.00216985, + "balance_loss_mlp": 1.00091493, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.9761328003147, + "language_loss": 0.84246808, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.8657707, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.516369581222534 + }, + { + "auxiliary_loss_clip": 0.01177453, + "auxiliary_loss_mlp": 0.01153285, + "balance_loss_clip": 1.00234473, + "balance_loss_mlp": 1.00088763, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.5719828974262127, + "language_loss": 0.64205897, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66536641, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.448676347732544 + }, + { + "auxiliary_loss_clip": 0.01160583, + "auxiliary_loss_mlp": 0.01153008, + "balance_loss_clip": 1.0021764, + "balance_loss_mlp": 1.00089729, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.3829794476135653, + "language_loss": 0.80469489, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82783085, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 2.558032512664795 + }, + { + "auxiliary_loss_clip": 0.01095072, + "auxiliary_loss_mlp": 0.01153219, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.00101209, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.509612193297198, + "language_loss": 0.76059949, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78308231, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.708754539489746 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01153031, + "balance_loss_clip": 1.00236964, + "balance_loss_mlp": 1.00092006, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.029657481809269, + "language_loss": 0.73702413, + "learning_rate": 3.686762546833722e-06, + "loss": 0.76017576, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 2.5999345779418945 + }, + { + "auxiliary_loss_clip": 0.0114495, + "auxiliary_loss_mlp": 0.01153297, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.00109017, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.019170588814694, + "language_loss": 0.77790105, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80088353, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 3.9910991191864014 + }, + { + "auxiliary_loss_clip": 0.0112948, + "auxiliary_loss_mlp": 0.01153051, + "balance_loss_clip": 1.00220513, + "balance_loss_mlp": 1.00084484, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 2.2222420621028194, + "language_loss": 0.84997815, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.87280345, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.6276681423187256 + }, + { + "auxiliary_loss_clip": 0.01160477, + "auxiliary_loss_mlp": 0.01152945, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00073814, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.9264072633787654, + "language_loss": 0.81106889, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.83420312, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.6280806064605713 + }, + { + "auxiliary_loss_clip": 0.01096016, + "auxiliary_loss_mlp": 0.01153555, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.00106239, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.9882345862129178, + "language_loss": 0.72962475, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75212049, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.743730306625366 + }, + { + "auxiliary_loss_clip": 0.01160551, + "auxiliary_loss_mlp": 0.01152773, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.00085235, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.38298593458651, + "language_loss": 0.79044133, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.81357455, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 4.032206058502197 + }, + { + "auxiliary_loss_clip": 0.01161405, + "auxiliary_loss_mlp": 0.01152891, + "balance_loss_clip": 1.00219035, + "balance_loss_mlp": 1.00097084, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.0607980292653996, + "language_loss": 0.87249482, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89563787, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.5097861289978027 + }, + { + "auxiliary_loss_clip": 0.01145877, + "auxiliary_loss_mlp": 0.01152992, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00078607, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.7720112769139376, + "language_loss": 0.62399912, + "learning_rate": 3.685296133421035e-06, + "loss": 0.6469878, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.5812346935272217 + }, + { + "auxiliary_loss_clip": 0.01144891, + "auxiliary_loss_mlp": 0.01154206, + "balance_loss_clip": 1.00220299, + "balance_loss_mlp": 1.00114167, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 1.8633872571677788, + "language_loss": 0.86334425, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88633525, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 4.039443016052246 + }, + { + "auxiliary_loss_clip": 0.01129318, + "auxiliary_loss_mlp": 0.00748858, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00115836, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.221323760381233, + "language_loss": 0.71365976, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73244154, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 4.087466478347778 + }, + { + "auxiliary_loss_clip": 0.01177153, + "auxiliary_loss_mlp": 0.01152594, + "balance_loss_clip": 1.00218356, + "balance_loss_mlp": 1.00086415, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 1.9042237107374402, + "language_loss": 0.70794821, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73124564, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.51558780670166 + }, + { + "auxiliary_loss_clip": 0.01161044, + "auxiliary_loss_mlp": 0.01142026, + "balance_loss_clip": 1.00281799, + "balance_loss_mlp": 1.00011981, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7545778532093489, + "language_loss": 0.55524111, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57827181, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.1753532886505127 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00085735, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.828789223685942, + "language_loss": 0.71769285, + "learning_rate": 3.684246777912353e-06, + "loss": 0.74032938, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.723029136657715 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.00748758, + "balance_loss_clip": 1.00249195, + "balance_loss_mlp": 1.0011549, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 2.5551918564547207, + "language_loss": 0.7510072, + "learning_rate": 3.684036715178351e-06, + "loss": 0.7698251, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.6408612728118896 + }, + { + "auxiliary_loss_clip": 0.01128656, + "auxiliary_loss_mlp": 0.01152898, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00107336, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.7896667447864438, + "language_loss": 0.88473666, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90755218, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.6619486808776855 + }, + { + "auxiliary_loss_clip": 0.01160704, + "auxiliary_loss_mlp": 0.01153364, + "balance_loss_clip": 1.00218821, + "balance_loss_mlp": 1.00077558, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.6494975680989101, + "language_loss": 0.77016026, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.79330099, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.6731204986572266 + }, + { + "auxiliary_loss_clip": 0.01177281, + "auxiliary_loss_mlp": 0.01153097, + "balance_loss_clip": 1.00228095, + "balance_loss_mlp": 1.00089014, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.5373082765347104, + "language_loss": 0.73948514, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76278889, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.511644124984741 + }, + { + "auxiliary_loss_clip": 0.01144206, + "auxiliary_loss_mlp": 0.01153765, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00089097, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 1.7136595231917058, + "language_loss": 0.73426008, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75723976, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.6156556606292725 + }, + { + "auxiliary_loss_clip": 0.01162214, + "auxiliary_loss_mlp": 0.01153801, + "balance_loss_clip": 1.00241184, + "balance_loss_mlp": 1.00102258, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.845208879568995, + "language_loss": 0.85358959, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87674975, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.5487215518951416 + }, + { + "auxiliary_loss_clip": 0.01096503, + "auxiliary_loss_mlp": 0.01153979, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00110471, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.514455066575086, + "language_loss": 0.68768716, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.71019197, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.700458526611328 + }, + { + "auxiliary_loss_clip": 0.01109878, + "auxiliary_loss_mlp": 0.01142717, + "balance_loss_clip": 1.00165164, + "balance_loss_mlp": 1.00004756, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8141236403474136, + "language_loss": 0.60251683, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.6250428, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.3414230346679688 + }, + { + "auxiliary_loss_clip": 0.01160581, + "auxiliary_loss_mlp": 0.0115362, + "balance_loss_clip": 1.0021975, + "balance_loss_mlp": 1.00112748, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.9632263202193445, + "language_loss": 0.72115362, + "learning_rate": 3.682353915057679e-06, + "loss": 0.7442956, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.539663553237915 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01153455, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.00077224, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.8980115620908888, + "language_loss": 0.86891699, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89159453, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.653878688812256 + }, + { + "auxiliary_loss_clip": 0.01161853, + "auxiliary_loss_mlp": 0.011535, + "balance_loss_clip": 1.00220919, + "balance_loss_mlp": 1.00081646, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.6897064344127326, + "language_loss": 0.6945008, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71765435, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.5912768840789795 + }, + { + "auxiliary_loss_clip": 0.01145444, + "auxiliary_loss_mlp": 0.01153019, + "balance_loss_clip": 1.00216103, + "balance_loss_mlp": 1.00081289, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 2.2776911933169366, + "language_loss": 0.8934117, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91639632, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 2.614802598953247 + }, + { + "auxiliary_loss_clip": 0.01129589, + "auxiliary_loss_mlp": 0.0115325, + "balance_loss_clip": 1.002105, + "balance_loss_mlp": 1.00075769, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.6009947670699298, + "language_loss": 0.76757622, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.79040462, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.6444268226623535 + }, + { + "auxiliary_loss_clip": 0.01165865, + "auxiliary_loss_mlp": 0.01153389, + "balance_loss_clip": 1.00226355, + "balance_loss_mlp": 1.00108767, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 2.5585049523861727, + "language_loss": 0.77687484, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.80006737, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.520137071609497 + }, + { + "auxiliary_loss_clip": 0.01159629, + "auxiliary_loss_mlp": 0.01142871, + "balance_loss_clip": 1.0026722, + "balance_loss_mlp": 1.00020146, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.82598853901162, + "language_loss": 0.67030096, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.693326, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.098032236099243 + }, + { + "auxiliary_loss_clip": 0.01160569, + "auxiliary_loss_mlp": 0.01153337, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00093961, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 3.361852438368205, + "language_loss": 0.84590155, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86904061, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.5312700271606445 + }, + { + "auxiliary_loss_clip": 0.01161502, + "auxiliary_loss_mlp": 0.01153749, + "balance_loss_clip": 1.00228024, + "balance_loss_mlp": 1.00097036, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 2.141171520337317, + "language_loss": 0.85006219, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87321472, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.5189881324768066 + }, + { + "auxiliary_loss_clip": 0.01116199, + "auxiliary_loss_mlp": 0.01153686, + "balance_loss_clip": 1.00224924, + "balance_loss_mlp": 1.00081158, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.7834904751270635, + "language_loss": 0.86110508, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88380396, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.8216936588287354 + }, + { + "auxiliary_loss_clip": 0.01084198, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 1.00091255, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.8186477025423236, + "language_loss": 0.73102856, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75341129, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.738417625427246 + }, + { + "auxiliary_loss_clip": 0.01143808, + "auxiliary_loss_mlp": 0.00748757, + "balance_loss_clip": 1.00189006, + "balance_loss_mlp": 1.00120115, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.0547839593035113, + "language_loss": 0.85396868, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87289429, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.6146538257598877 + }, + { + "auxiliary_loss_clip": 0.01109481, + "auxiliary_loss_mlp": 0.01141917, + "balance_loss_clip": 1.00187385, + "balance_loss_mlp": 1.00001001, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6850949047731496, + "language_loss": 0.57076812, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.5932821, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.268097400665283 + }, + { + "auxiliary_loss_clip": 0.01177353, + "auxiliary_loss_mlp": 0.00748778, + "balance_loss_clip": 1.00236201, + "balance_loss_mlp": 1.00116861, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.4793731732185287, + "language_loss": 0.78371894, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80298024, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.808037281036377 + }, + { + "auxiliary_loss_clip": 0.01162104, + "auxiliary_loss_mlp": 0.01153946, + "balance_loss_clip": 1.00228524, + "balance_loss_mlp": 1.00097704, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 1.9675001034059567, + "language_loss": 0.62601185, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64917231, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.643880605697632 + }, + { + "auxiliary_loss_clip": 0.01115084, + "auxiliary_loss_mlp": 0.01153159, + "balance_loss_clip": 1.00197899, + "balance_loss_mlp": 1.0009526, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 3.029259687407082, + "language_loss": 0.86138487, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88406736, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.676828145980835 + }, + { + "auxiliary_loss_clip": 0.01144765, + "auxiliary_loss_mlp": 0.01153046, + "balance_loss_clip": 1.00205743, + "balance_loss_mlp": 1.00084019, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 2.240581644622846, + "language_loss": 0.75215679, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77513492, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 2.664499521255493 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.0115332, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00101829, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.9557638606010788, + "language_loss": 0.76638567, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78937864, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 4.0623719692230225 + }, + { + "auxiliary_loss_clip": 0.01128304, + "auxiliary_loss_mlp": 0.01153194, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00098729, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5652594285498558, + "language_loss": 0.82458586, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84740078, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.6190168857574463 + }, + { + "auxiliary_loss_clip": 0.01176114, + "auxiliary_loss_mlp": 0.01141935, + "balance_loss_clip": 1.00281405, + "balance_loss_mlp": 1.00002849, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7866470166961399, + "language_loss": 0.56565738, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58883786, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 2.990994453430176 + }, + { + "auxiliary_loss_clip": 0.01128308, + "auxiliary_loss_mlp": 0.00748806, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.00108063, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 7.215197111524685, + "language_loss": 0.88090986, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.89968097, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.596919298171997 + }, + { + "auxiliary_loss_clip": 0.01166092, + "auxiliary_loss_mlp": 0.01153337, + "balance_loss_clip": 1.00259781, + "balance_loss_mlp": 1.00084472, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.638124940317573, + "language_loss": 0.80326521, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82645953, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.595205068588257 + }, + { + "auxiliary_loss_clip": 0.01130773, + "auxiliary_loss_mlp": 0.00748699, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00099421, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 2.520897614332448, + "language_loss": 0.76704764, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78584242, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 3.9761264324188232 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01153068, + "balance_loss_clip": 1.00257277, + "balance_loss_mlp": 1.00086164, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.7520235178136356, + "language_loss": 0.80944175, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.8323136, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.626762866973877 + }, + { + "auxiliary_loss_clip": 0.01113757, + "auxiliary_loss_mlp": 0.00748759, + "balance_loss_clip": 1.002074, + "balance_loss_mlp": 1.00112724, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.6731703540612155, + "language_loss": 0.77927011, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.79789525, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 2.6669774055480957 + }, + { + "auxiliary_loss_clip": 0.01098061, + "auxiliary_loss_mlp": 0.01153195, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.000893, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.746476247369602, + "language_loss": 0.83418739, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85669994, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 4.057554483413696 + }, + { + "auxiliary_loss_clip": 0.01161728, + "auxiliary_loss_mlp": 0.00748618, + "balance_loss_clip": 1.00232577, + "balance_loss_mlp": 1.00105715, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 2.0834088108705315, + "language_loss": 0.7575103, + "learning_rate": 3.676856638489272e-06, + "loss": 0.77661377, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 4.1209821701049805 + }, + { + "auxiliary_loss_clip": 0.01095895, + "auxiliary_loss_mlp": 0.01152222, + "balance_loss_clip": 1.00177526, + "balance_loss_mlp": 1.00068331, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 1.883003982443759, + "language_loss": 0.77244115, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79492235, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.7649636268615723 + }, + { + "auxiliary_loss_clip": 0.01096998, + "auxiliary_loss_mlp": 0.01152751, + "balance_loss_clip": 1.00173962, + "balance_loss_mlp": 1.00083065, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 1.7201016573583001, + "language_loss": 0.75718129, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77967876, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.7278523445129395 + }, + { + "auxiliary_loss_clip": 0.01145174, + "auxiliary_loss_mlp": 0.01153051, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.00074983, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8715474962849254, + "language_loss": 0.88686383, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90984613, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.6333603858947754 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.00747677, + "balance_loss_clip": 1.00241899, + "balance_loss_mlp": 1.00016332, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.763757091890151, + "language_loss": 0.59081954, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.60941577, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.3500750064849854 + }, + { + "auxiliary_loss_clip": 0.01145101, + "auxiliary_loss_mlp": 0.0115322, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00082254, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.5608428473937654, + "language_loss": 0.65919578, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68217897, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.7323405742645264 + }, + { + "auxiliary_loss_clip": 0.01128485, + "auxiliary_loss_mlp": 0.01153487, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00089908, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 1.9296714252880893, + "language_loss": 0.84064364, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.8634634, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.7124087810516357 + }, + { + "auxiliary_loss_clip": 0.01112634, + "auxiliary_loss_mlp": 0.01153306, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.00081325, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.217444797527276, + "language_loss": 0.81871867, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84137803, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.753413677215576 + }, + { + "auxiliary_loss_clip": 0.01160463, + "auxiliary_loss_mlp": 0.01152941, + "balance_loss_clip": 1.00218821, + "balance_loss_mlp": 1.0008297, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.784268575504575, + "language_loss": 0.82004023, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84317422, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.5453028678894043 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01151998, + "balance_loss_clip": 1.00232565, + "balance_loss_mlp": 1.00093603, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 1.839149478187678, + "language_loss": 0.81786048, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84115231, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 2.4802567958831787 + }, + { + "auxiliary_loss_clip": 0.01160726, + "auxiliary_loss_mlp": 0.01153231, + "balance_loss_clip": 1.00223768, + "balance_loss_mlp": 1.00092936, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.8990507468940518, + "language_loss": 0.90529454, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92843413, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.5488951206207275 + }, + { + "auxiliary_loss_clip": 0.01144554, + "auxiliary_loss_mlp": 0.01153543, + "balance_loss_clip": 1.00212038, + "balance_loss_mlp": 1.0009551, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.8816741512426147, + "language_loss": 0.76748347, + "learning_rate": 3.674517919597092e-06, + "loss": 0.79046446, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.715165615081787 + }, + { + "auxiliary_loss_clip": 0.0114379, + "auxiliary_loss_mlp": 0.01152912, + "balance_loss_clip": 1.0021683, + "balance_loss_mlp": 1.00089681, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.6616781568052046, + "language_loss": 0.75370377, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77667081, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.6154558658599854 + }, + { + "auxiliary_loss_clip": 0.01127474, + "auxiliary_loss_mlp": 0.011535, + "balance_loss_clip": 1.00184083, + "balance_loss_mlp": 1.00091243, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 3.748190834295866, + "language_loss": 0.75506228, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77787203, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.7016777992248535 + }, + { + "auxiliary_loss_clip": 0.011658, + "auxiliary_loss_mlp": 0.01152874, + "balance_loss_clip": 1.00225914, + "balance_loss_mlp": 1.00076294, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.8907535647833489, + "language_loss": 0.84777868, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.87096548, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.535825729370117 + }, + { + "auxiliary_loss_clip": 0.01127961, + "auxiliary_loss_mlp": 0.01143311, + "balance_loss_clip": 1.00237322, + "balance_loss_mlp": 1.00140464, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.874099601991925, + "language_loss": 0.63625038, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65896308, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.1164393424987793 + }, + { + "auxiliary_loss_clip": 0.01145154, + "auxiliary_loss_mlp": 0.01153134, + "balance_loss_clip": 1.00232279, + "balance_loss_mlp": 1.00083232, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.1990917356598496, + "language_loss": 0.70098293, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72396588, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 2.7492356300354004 + }, + { + "auxiliary_loss_clip": 0.01177396, + "auxiliary_loss_mlp": 0.01153129, + "balance_loss_clip": 1.00237942, + "balance_loss_mlp": 1.00082684, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.9387263744372578, + "language_loss": 0.69991618, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72322142, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.5259761810302734 + }, + { + "auxiliary_loss_clip": 0.01143974, + "auxiliary_loss_mlp": 0.01152743, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00101364, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.087999843640943, + "language_loss": 0.89076018, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9137274, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.5804457664489746 + }, + { + "auxiliary_loss_clip": 0.01112716, + "auxiliary_loss_mlp": 0.01152577, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00084782, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.3840080950930713, + "language_loss": 0.67798173, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70063466, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.6858487129211426 + }, + { + "auxiliary_loss_clip": 0.01131006, + "auxiliary_loss_mlp": 0.01153091, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00107539, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.5402006533412176, + "language_loss": 0.84756625, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87040722, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.578747510910034 + }, + { + "auxiliary_loss_clip": 0.01133656, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_clip": 1.00228286, + "balance_loss_mlp": 1.00097334, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.3636623136302926, + "language_loss": 0.74060512, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76346964, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.613680601119995 + }, + { + "auxiliary_loss_clip": 0.0112786, + "auxiliary_loss_mlp": 0.01152091, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00093389, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.0766046162151923, + "language_loss": 0.7618621, + "learning_rate": 3.67217151746346e-06, + "loss": 0.78466165, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.5607688426971436 + }, + { + "auxiliary_loss_clip": 0.01113437, + "auxiliary_loss_mlp": 0.01152622, + "balance_loss_clip": 1.00189865, + "balance_loss_mlp": 1.00108349, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 1.966475006295988, + "language_loss": 0.8550095, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87767005, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.7488009929656982 + }, + { + "auxiliary_loss_clip": 0.01094731, + "auxiliary_loss_mlp": 0.01152291, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.0008471, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 1.7659480239378043, + "language_loss": 0.71175218, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.73422241, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 2.769897222518921 + }, + { + "auxiliary_loss_clip": 0.0114401, + "auxiliary_loss_mlp": 0.01153182, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00116634, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.6804545169072707, + "language_loss": 0.75046456, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77343643, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.570868730545044 + }, + { + "auxiliary_loss_clip": 0.01145747, + "auxiliary_loss_mlp": 0.01153417, + "balance_loss_clip": 1.00232387, + "balance_loss_mlp": 1.00092435, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 2.05786227819184, + "language_loss": 0.70500904, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72800064, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.6532363891601562 + }, + { + "auxiliary_loss_clip": 0.01113253, + "auxiliary_loss_mlp": 0.00748636, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00087762, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 2.6417299599259594, + "language_loss": 0.83463228, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85325116, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.714606523513794 + }, + { + "auxiliary_loss_clip": 0.01160436, + "auxiliary_loss_mlp": 0.01152693, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00105929, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.6876639200390986, + "language_loss": 0.87434506, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89747643, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 4.187364339828491 + }, + { + "auxiliary_loss_clip": 0.01128968, + "auxiliary_loss_mlp": 0.01152339, + "balance_loss_clip": 1.00200069, + "balance_loss_mlp": 1.00089586, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.3446627545847814, + "language_loss": 0.72699249, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74980551, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 2.696767807006836 + }, + { + "auxiliary_loss_clip": 0.01144078, + "auxiliary_loss_mlp": 0.01152878, + "balance_loss_clip": 1.00202191, + "balance_loss_mlp": 1.00086188, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.5382250695498714, + "language_loss": 0.80133563, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82430518, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.6381192207336426 + }, + { + "auxiliary_loss_clip": 0.01177186, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00084627, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.748475224448254, + "language_loss": 0.73020995, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75350857, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 2.631892681121826 + }, + { + "auxiliary_loss_clip": 0.01143639, + "auxiliary_loss_mlp": 0.01152002, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.00122631, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 2.036949629281407, + "language_loss": 0.70558029, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72853667, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.592494249343872 + }, + { + "auxiliary_loss_clip": 0.01160332, + "auxiliary_loss_mlp": 0.00748629, + "balance_loss_clip": 1.00197113, + "balance_loss_mlp": 1.00079846, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 2.5642419134928947, + "language_loss": 0.79242671, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81151628, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 3.9010980129241943 + }, + { + "auxiliary_loss_clip": 0.01160454, + "auxiliary_loss_mlp": 0.00748594, + "balance_loss_clip": 1.00212264, + "balance_loss_mlp": 1.00077724, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9618772581167274, + "language_loss": 0.86936432, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88845479, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.6444828510284424 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01152128, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00097108, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.7714027233677132, + "language_loss": 0.69115877, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.71414208, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.5717546939849854 + }, + { + "auxiliary_loss_clip": 0.01161421, + "auxiliary_loss_mlp": 0.01152597, + "balance_loss_clip": 1.00233579, + "balance_loss_mlp": 1.00086749, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7048997653954263, + "language_loss": 0.78699303, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81013316, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 4.069968223571777 + }, + { + "auxiliary_loss_clip": 0.0114489, + "auxiliary_loss_mlp": 0.01152637, + "balance_loss_clip": 1.0021404, + "balance_loss_mlp": 1.0009079, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7212160446748208, + "language_loss": 0.77249944, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79547471, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 4.074173212051392 + }, + { + "auxiliary_loss_clip": 0.01145811, + "auxiliary_loss_mlp": 0.01152787, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00096238, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 3.5399991796615087, + "language_loss": 0.81931275, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84229875, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.596031427383423 + }, + { + "auxiliary_loss_clip": 0.0116006, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_clip": 1.00204539, + "balance_loss_mlp": 1.00108719, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.7449631590472383, + "language_loss": 0.67734742, + "learning_rate": 3.668530172166741e-06, + "loss": 0.70047903, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.555454730987549 + }, + { + "auxiliary_loss_clip": 0.01132498, + "auxiliary_loss_mlp": 0.01152705, + "balance_loss_clip": 1.00207937, + "balance_loss_mlp": 1.00097513, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 1.7688421254318374, + "language_loss": 0.80678976, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.8296417, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.6127583980560303 + }, + { + "auxiliary_loss_clip": 0.01160484, + "auxiliary_loss_mlp": 0.0115264, + "balance_loss_clip": 1.00212455, + "balance_loss_mlp": 1.00100565, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.594923208583268, + "language_loss": 0.78323668, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80636787, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.5763492584228516 + }, + { + "auxiliary_loss_clip": 0.01143749, + "auxiliary_loss_mlp": 0.01152742, + "balance_loss_clip": 1.00204051, + "balance_loss_mlp": 1.0010128, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.6106933984545628, + "language_loss": 0.7419489, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.7649138, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.6328954696655273 + }, + { + "auxiliary_loss_clip": 0.01160504, + "auxiliary_loss_mlp": 0.01151905, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00093889, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.5174835221170608, + "language_loss": 0.75193959, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77506363, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.53463077545166 + }, + { + "auxiliary_loss_clip": 0.01113652, + "auxiliary_loss_mlp": 0.01152106, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00075793, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.776021351935734, + "language_loss": 0.77343112, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7960887, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.731529951095581 + }, + { + "auxiliary_loss_clip": 0.01096452, + "auxiliary_loss_mlp": 0.0115335, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00095296, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.386363345664762, + "language_loss": 0.7859509, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80844891, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.6791975498199463 + }, + { + "auxiliary_loss_clip": 0.01128099, + "auxiliary_loss_mlp": 0.01153012, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.00109172, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 2.592955918409517, + "language_loss": 0.77027273, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79308391, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.6538710594177246 + }, + { + "auxiliary_loss_clip": 0.01145079, + "auxiliary_loss_mlp": 0.01152605, + "balance_loss_clip": 1.00213802, + "balance_loss_mlp": 1.00106645, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.8721033890301537, + "language_loss": 0.63959932, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66257614, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.6180152893066406 + }, + { + "auxiliary_loss_clip": 0.01160515, + "auxiliary_loss_mlp": 0.01152511, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.00097227, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.7853420173501298, + "language_loss": 0.82247341, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84560364, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.6228084564208984 + }, + { + "auxiliary_loss_clip": 0.01160415, + "auxiliary_loss_mlp": 0.01152577, + "balance_loss_clip": 1.00210595, + "balance_loss_mlp": 1.0011338, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.7482068303566762, + "language_loss": 0.76081181, + "learning_rate": 3.666379660223824e-06, + "loss": 0.78394175, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.5521955490112305 + }, + { + "auxiliary_loss_clip": 0.01177207, + "auxiliary_loss_mlp": 0.0115252, + "balance_loss_clip": 1.00229454, + "balance_loss_mlp": 1.00098097, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.4352729786938228, + "language_loss": 0.85539591, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87869322, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.4872446060180664 + }, + { + "auxiliary_loss_clip": 0.01127167, + "auxiliary_loss_mlp": 0.0115227, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.0008266, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.7085113289864022, + "language_loss": 0.68049645, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70329082, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.7943432331085205 + }, + { + "auxiliary_loss_clip": 0.01177204, + "auxiliary_loss_mlp": 0.01152539, + "balance_loss_clip": 1.00224137, + "balance_loss_mlp": 1.00080907, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.7486347943371794, + "language_loss": 0.72125655, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74455398, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.649672508239746 + }, + { + "auxiliary_loss_clip": 0.01062813, + "auxiliary_loss_mlp": 0.01153257, + "balance_loss_clip": 1.00166821, + "balance_loss_mlp": 1.00114584, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.3642465311197314, + "language_loss": 0.69515646, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71731716, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 2.7993085384368896 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01152543, + "balance_loss_clip": 1.00212145, + "balance_loss_mlp": 1.00100374, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 2.112737661071405, + "language_loss": 0.73409414, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75722438, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 2.715832471847534 + }, + { + "auxiliary_loss_clip": 0.01144754, + "auxiliary_loss_mlp": 0.01152117, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00086462, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.7958553629060956, + "language_loss": 0.74182367, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76479244, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.7372679710388184 + }, + { + "auxiliary_loss_clip": 0.01144552, + "auxiliary_loss_mlp": 0.01152395, + "balance_loss_clip": 1.00217009, + "balance_loss_mlp": 1.00066578, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.8943321430859001, + "language_loss": 0.76657706, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78954649, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.6488027572631836 + }, + { + "auxiliary_loss_clip": 0.0114509, + "auxiliary_loss_mlp": 0.01152646, + "balance_loss_clip": 1.00226128, + "balance_loss_mlp": 1.0011071, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.9386817026560985, + "language_loss": 0.6877414, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.71071881, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.5523784160614014 + }, + { + "auxiliary_loss_clip": 0.01126919, + "auxiliary_loss_mlp": 0.01153303, + "balance_loss_clip": 1.00179231, + "balance_loss_mlp": 1.00128698, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.843221197083526, + "language_loss": 0.85199356, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87479573, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.6064791679382324 + }, + { + "auxiliary_loss_clip": 0.01144711, + "auxiliary_loss_mlp": 0.01152217, + "balance_loss_clip": 1.00216222, + "balance_loss_mlp": 1.00115526, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 1.7764600835207354, + "language_loss": 0.62815523, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65112454, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.6794826984405518 + }, + { + "auxiliary_loss_clip": 0.01094811, + "auxiliary_loss_mlp": 0.01152584, + "balance_loss_clip": 1.00176191, + "balance_loss_mlp": 1.0012362, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8610358783447645, + "language_loss": 0.889337, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91181099, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.698910713195801 + }, + { + "auxiliary_loss_clip": 0.01145192, + "auxiliary_loss_mlp": 0.01152863, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.00122833, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 2.122776446740882, + "language_loss": 0.8146866, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83766717, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.615004539489746 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01152131, + "balance_loss_clip": 1.00207162, + "balance_loss_mlp": 1.00106931, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.541896212919794, + "language_loss": 0.76286477, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78582287, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 2.598703145980835 + }, + { + "auxiliary_loss_clip": 0.01099634, + "auxiliary_loss_mlp": 0.01152211, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.0009582, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 1.9632059861474913, + "language_loss": 0.75640506, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77892351, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.686748504638672 + }, + { + "auxiliary_loss_clip": 0.01177189, + "auxiliary_loss_mlp": 0.01152918, + "balance_loss_clip": 1.00218642, + "balance_loss_mlp": 1.00137877, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.0454426337342118, + "language_loss": 0.70411116, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72741222, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.536813259124756 + }, + { + "auxiliary_loss_clip": 0.01161923, + "auxiliary_loss_mlp": 0.01152503, + "balance_loss_clip": 1.00230885, + "balance_loss_mlp": 1.00125027, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.2639264649568522, + "language_loss": 0.77210808, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.79525232, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.5140089988708496 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01152222, + "balance_loss_clip": 1.00202715, + "balance_loss_mlp": 1.00087392, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.372849823437295, + "language_loss": 0.81500965, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83797002, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 4.013273239135742 + }, + { + "auxiliary_loss_clip": 0.01097405, + "auxiliary_loss_mlp": 0.01152146, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00079823, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 1.7878332610373915, + "language_loss": 0.74905974, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77155524, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.726398229598999 + }, + { + "auxiliary_loss_clip": 0.01177235, + "auxiliary_loss_mlp": 0.01151776, + "balance_loss_clip": 1.00226259, + "balance_loss_mlp": 1.00090504, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.7744431142795276, + "language_loss": 0.77009821, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79338831, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.501214027404785 + }, + { + "auxiliary_loss_clip": 0.01177082, + "auxiliary_loss_mlp": 0.01152269, + "balance_loss_clip": 1.00218129, + "balance_loss_mlp": 1.00092101, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 2.3969584188084974, + "language_loss": 0.781753, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80504644, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.480938673019409 + }, + { + "auxiliary_loss_clip": 0.01160538, + "auxiliary_loss_mlp": 0.01152037, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00087965, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.75025503656131, + "language_loss": 0.8171131, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.84023881, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.5626413822174072 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.00748722, + "balance_loss_clip": 1.00201237, + "balance_loss_mlp": 1.00097013, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.3438043260613, + "language_loss": 0.77171201, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.79064566, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 4.06681227684021 + }, + { + "auxiliary_loss_clip": 0.01177207, + "auxiliary_loss_mlp": 0.01151822, + "balance_loss_clip": 1.00234902, + "balance_loss_mlp": 1.00095046, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 3.136939945135182, + "language_loss": 0.82937163, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85266197, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 2.535918951034546 + }, + { + "auxiliary_loss_clip": 0.01145236, + "auxiliary_loss_mlp": 0.01151976, + "balance_loss_clip": 1.00220013, + "balance_loss_mlp": 1.00091362, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 6.600812491963507, + "language_loss": 0.7350924, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75806451, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.543349266052246 + }, + { + "auxiliary_loss_clip": 0.01128852, + "auxiliary_loss_mlp": 0.01151906, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00093961, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.67912234071673, + "language_loss": 0.7394622, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76226979, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 4.087405443191528 + }, + { + "auxiliary_loss_clip": 0.01161216, + "auxiliary_loss_mlp": 0.01152519, + "balance_loss_clip": 1.00222468, + "balance_loss_mlp": 1.00098014, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.9174426359733696, + "language_loss": 0.7125001, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73563743, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.680563449859619 + }, + { + "auxiliary_loss_clip": 0.01143799, + "auxiliary_loss_mlp": 0.01152076, + "balance_loss_clip": 1.00211155, + "balance_loss_mlp": 1.00091863, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.7710714858003582, + "language_loss": 0.715819, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.7387777, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 2.5708529949188232 + }, + { + "auxiliary_loss_clip": 0.01160408, + "auxiliary_loss_mlp": 0.01151942, + "balance_loss_clip": 1.0021342, + "balance_loss_mlp": 1.00116634, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 2.0455833620061967, + "language_loss": 0.70433754, + "learning_rate": 3.660324636216996e-06, + "loss": 0.7274611, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.5855324268341064 + }, + { + "auxiliary_loss_clip": 0.01177097, + "auxiliary_loss_mlp": 0.01152178, + "balance_loss_clip": 1.00216973, + "balance_loss_mlp": 1.00102067, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9299200408615378, + "language_loss": 0.87725544, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90054816, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.510019063949585 + }, + { + "auxiliary_loss_clip": 0.0116044, + "auxiliary_loss_mlp": 0.00748561, + "balance_loss_clip": 1.00211918, + "balance_loss_mlp": 1.00080144, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 2.587093405053766, + "language_loss": 0.80390382, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82299387, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.587277412414551 + }, + { + "auxiliary_loss_clip": 0.01096466, + "auxiliary_loss_mlp": 0.01151368, + "balance_loss_clip": 1.00175428, + "balance_loss_mlp": 1.00087869, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.7223795989241308, + "language_loss": 0.87379766, + "learning_rate": 3.659672952835863e-06, + "loss": 0.896276, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.715026617050171 + }, + { + "auxiliary_loss_clip": 0.01144923, + "auxiliary_loss_mlp": 0.01151812, + "balance_loss_clip": 1.00222635, + "balance_loss_mlp": 1.00103569, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 2.721998016628887, + "language_loss": 0.577595, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60056233, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.5514304637908936 + }, + { + "auxiliary_loss_clip": 0.01177163, + "auxiliary_loss_mlp": 0.01151698, + "balance_loss_clip": 1.00230849, + "balance_loss_mlp": 1.00073147, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 1.8639895323743194, + "language_loss": 0.75822854, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78151721, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.494584083557129 + }, + { + "auxiliary_loss_clip": 0.01129904, + "auxiliary_loss_mlp": 0.0115213, + "balance_loss_clip": 1.00229132, + "balance_loss_mlp": 1.00106859, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.9476119040087705, + "language_loss": 0.69769609, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.72051644, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.636082649230957 + }, + { + "auxiliary_loss_clip": 0.01176989, + "auxiliary_loss_mlp": 0.01151157, + "balance_loss_clip": 1.00225067, + "balance_loss_mlp": 1.00066757, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.875780757948157, + "language_loss": 0.76190656, + "learning_rate": 3.658803160610004e-06, + "loss": 0.78518802, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.541548013687134 + }, + { + "auxiliary_loss_clip": 0.01143724, + "auxiliary_loss_mlp": 0.01151647, + "balance_loss_clip": 1.00206864, + "balance_loss_mlp": 1.00096643, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.932734448020732, + "language_loss": 0.66672182, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68967557, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.5424153804779053 + }, + { + "auxiliary_loss_clip": 0.01145338, + "auxiliary_loss_mlp": 0.01152211, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.00105381, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.7902252915503687, + "language_loss": 0.7099269, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73290241, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.6198341846466064 + }, + { + "auxiliary_loss_clip": 0.01143989, + "auxiliary_loss_mlp": 0.01152298, + "balance_loss_clip": 1.00194466, + "balance_loss_mlp": 1.00104547, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.5652641508986143, + "language_loss": 0.72329134, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74625421, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.658202886581421 + }, + { + "auxiliary_loss_clip": 0.01112437, + "auxiliary_loss_mlp": 0.01151995, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00102878, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 2.1221537679752394, + "language_loss": 0.80389452, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82653892, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.678473711013794 + }, + { + "auxiliary_loss_clip": 0.01177011, + "auxiliary_loss_mlp": 0.01151491, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.00081027, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 2.762250232637132, + "language_loss": 0.7547152, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77800012, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 2.5451529026031494 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01152726, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00109148, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 1.8364633523471683, + "language_loss": 0.7440384, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76685226, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.6093602180480957 + }, + { + "auxiliary_loss_clip": 0.01128084, + "auxiliary_loss_mlp": 0.01151931, + "balance_loss_clip": 1.00197136, + "balance_loss_mlp": 1.00105989, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6522995090766543, + "language_loss": 0.80825996, + "learning_rate": 3.657278602806357e-06, + "loss": 0.83106017, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.6280767917633057 + }, + { + "auxiliary_loss_clip": 0.01176932, + "auxiliary_loss_mlp": 0.01151041, + "balance_loss_clip": 1.00220263, + "balance_loss_mlp": 1.00093293, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.626273535363311, + "language_loss": 0.87928617, + "learning_rate": 3.657060557391621e-06, + "loss": 0.9025659, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 2.5150516033172607 + }, + { + "auxiliary_loss_clip": 0.01177002, + "auxiliary_loss_mlp": 0.01151559, + "balance_loss_clip": 1.00218618, + "balance_loss_mlp": 1.00097418, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.0433131075477817, + "language_loss": 0.83385217, + "learning_rate": 3.656842449140983e-06, + "loss": 0.8571378, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.493581533432007 + }, + { + "auxiliary_loss_clip": 0.01161678, + "auxiliary_loss_mlp": 0.01151665, + "balance_loss_clip": 1.00220978, + "balance_loss_mlp": 1.00117564, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.6853347912152297, + "language_loss": 0.76693827, + "learning_rate": 3.656624278062713e-06, + "loss": 0.79007173, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.5700385570526123 + }, + { + "auxiliary_loss_clip": 0.01160255, + "auxiliary_loss_mlp": 0.01151028, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.00111079, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.7008890363141558, + "language_loss": 0.7253679, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.7484808, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.5517523288726807 + }, + { + "auxiliary_loss_clip": 0.01113043, + "auxiliary_loss_mlp": 0.00748448, + "balance_loss_clip": 1.00216866, + "balance_loss_mlp": 1.000947, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.926376099251292, + "language_loss": 0.68006915, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69868404, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.658622980117798 + }, + { + "auxiliary_loss_clip": 0.0112802, + "auxiliary_loss_mlp": 0.01151899, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00074196, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 1.9008531171053384, + "language_loss": 0.64909899, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67189819, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.6527795791625977 + }, + { + "auxiliary_loss_clip": 0.01160414, + "auxiliary_loss_mlp": 0.01151516, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00112128, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.730816308158042, + "language_loss": 0.72405612, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74717546, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.565474271774292 + }, + { + "auxiliary_loss_clip": 0.01145877, + "auxiliary_loss_mlp": 0.00748578, + "balance_loss_clip": 1.00229013, + "balance_loss_mlp": 1.00105524, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.975323209287676, + "language_loss": 0.6740936, + "learning_rate": 3.655532480546528e-06, + "loss": 0.69303811, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.6593105792999268 + }, + { + "auxiliary_loss_clip": 0.01177023, + "auxiliary_loss_mlp": 0.01151578, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00089741, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8007569945593283, + "language_loss": 0.79556656, + "learning_rate": 3.655313932676286e-06, + "loss": 0.8188526, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.4899349212646484 + }, + { + "auxiliary_loss_clip": 0.01176996, + "auxiliary_loss_mlp": 0.01151006, + "balance_loss_clip": 1.0021863, + "balance_loss_mlp": 1.00099325, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.5691510597341993, + "language_loss": 0.68178368, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70506358, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.5304534435272217 + }, + { + "auxiliary_loss_clip": 0.01160513, + "auxiliary_loss_mlp": 0.01151424, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00112534, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.9433785608670358, + "language_loss": 0.73385888, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75697821, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.6198935508728027 + }, + { + "auxiliary_loss_clip": 0.01146235, + "auxiliary_loss_mlp": 0.01151654, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.00106907, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.6332121818712984, + "language_loss": 0.77296162, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79594052, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 4.067061901092529 + }, + { + "auxiliary_loss_clip": 0.0117696, + "auxiliary_loss_mlp": 0.01150926, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.0008173, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.4902202060093552, + "language_loss": 0.84432095, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86759984, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.4989686012268066 + }, + { + "auxiliary_loss_clip": 0.01177097, + "auxiliary_loss_mlp": 0.01150986, + "balance_loss_clip": 1.00235069, + "balance_loss_mlp": 1.00097346, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.4776137825724787, + "language_loss": 0.7667042, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78998506, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.6363446712493896 + }, + { + "auxiliary_loss_clip": 0.01143579, + "auxiliary_loss_mlp": 0.01151163, + "balance_loss_clip": 1.00210464, + "balance_loss_mlp": 1.00095904, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.7244176603751802, + "language_loss": 0.88501441, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90796185, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.6501152515411377 + }, + { + "auxiliary_loss_clip": 0.01143843, + "auxiliary_loss_mlp": 0.01140555, + "balance_loss_clip": 1.00301313, + "balance_loss_mlp": 1.00093746, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8291477225323286, + "language_loss": 0.52235138, + "learning_rate": 3.653782340498215e-06, + "loss": 0.5451954, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.103675603866577 + }, + { + "auxiliary_loss_clip": 0.01160118, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_clip": 1.00216866, + "balance_loss_mlp": 1.00073397, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.8197183382996944, + "language_loss": 0.67576098, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69886297, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.536696672439575 + }, + { + "auxiliary_loss_clip": 0.01145565, + "auxiliary_loss_mlp": 0.01150843, + "balance_loss_clip": 1.00227034, + "balance_loss_mlp": 1.00102067, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.586455609459789, + "language_loss": 0.7439912, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.7669552, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 4.137593507766724 + }, + { + "auxiliary_loss_clip": 0.01160259, + "auxiliary_loss_mlp": 0.01151229, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00092959, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.700796431869289, + "language_loss": 0.77431887, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79743373, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 2.5549421310424805 + }, + { + "auxiliary_loss_clip": 0.01161763, + "auxiliary_loss_mlp": 0.01151647, + "balance_loss_clip": 1.00240135, + "balance_loss_mlp": 1.00087094, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.422495953494989, + "language_loss": 0.70288306, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72601724, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 4.0046350955963135 + }, + { + "auxiliary_loss_clip": 0.01177097, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_clip": 1.00222468, + "balance_loss_mlp": 1.00105834, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.2651423770832197, + "language_loss": 0.78902745, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.81230915, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 3.9570603370666504 + }, + { + "auxiliary_loss_clip": 0.01161453, + "auxiliary_loss_mlp": 0.01151603, + "balance_loss_clip": 1.00228679, + "balance_loss_mlp": 1.00082731, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 1.9782742938077875, + "language_loss": 0.82739449, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85052508, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.496964454650879 + }, + { + "auxiliary_loss_clip": 0.0114378, + "auxiliary_loss_mlp": 0.0115129, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00080061, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.4962072933315076, + "language_loss": 0.65542579, + "learning_rate": 3.652247675452598e-06, + "loss": 0.6783765, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 2.6005122661590576 + }, + { + "auxiliary_loss_clip": 0.01176864, + "auxiliary_loss_mlp": 0.01150701, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.00087905, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.075441768675498, + "language_loss": 0.75478292, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77805853, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.5059752464294434 + }, + { + "auxiliary_loss_clip": 0.01160247, + "auxiliary_loss_mlp": 0.01151062, + "balance_loss_clip": 1.00215888, + "balance_loss_mlp": 1.00076258, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.905604819363467, + "language_loss": 0.72298527, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.7460984, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 2.5182294845581055 + }, + { + "auxiliary_loss_clip": 0.01144262, + "auxiliary_loss_mlp": 0.01150497, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00086582, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 2.9008690906223866, + "language_loss": 0.68382919, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.7067768, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.5841941833496094 + }, + { + "auxiliary_loss_clip": 0.01161432, + "auxiliary_loss_mlp": 0.01151142, + "balance_loss_clip": 1.00232184, + "balance_loss_mlp": 1.00065184, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.2204272771575866, + "language_loss": 0.88584661, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90897232, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.602515935897827 + }, + { + "auxiliary_loss_clip": 0.01145047, + "auxiliary_loss_mlp": 0.01141594, + "balance_loss_clip": 1.00286031, + "balance_loss_mlp": 1.00121295, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8213297494325154, + "language_loss": 0.56231833, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58518469, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.127119302749634 + }, + { + "auxiliary_loss_clip": 0.01161657, + "auxiliary_loss_mlp": 0.00748572, + "balance_loss_clip": 1.00232935, + "balance_loss_mlp": 1.00112677, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6498984701388661, + "language_loss": 0.88577634, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90487862, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.616429090499878 + }, + { + "auxiliary_loss_clip": 0.01161537, + "auxiliary_loss_mlp": 0.01150685, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00067234, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.6308335943906729, + "language_loss": 0.78080344, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80392563, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.5810232162475586 + }, + { + "auxiliary_loss_clip": 0.01160876, + "auxiliary_loss_mlp": 0.0115057, + "balance_loss_clip": 1.0021106, + "balance_loss_mlp": 1.00093865, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 2.089136171937011, + "language_loss": 0.72723359, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75034809, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.5903334617614746 + }, + { + "auxiliary_loss_clip": 0.0116027, + "auxiliary_loss_mlp": 0.01150983, + "balance_loss_clip": 1.00220597, + "balance_loss_mlp": 1.00077915, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.328282912287792, + "language_loss": 0.70761764, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73073018, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.6203620433807373 + }, + { + "auxiliary_loss_clip": 0.01176839, + "auxiliary_loss_mlp": 0.01150908, + "balance_loss_clip": 1.00214314, + "balance_loss_mlp": 1.00089502, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 12.732565822279222, + "language_loss": 0.84799671, + "learning_rate": 3.650049971985889e-06, + "loss": 0.87127411, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.469569683074951 + }, + { + "auxiliary_loss_clip": 0.01145002, + "auxiliary_loss_mlp": 0.01151234, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00103045, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 3.2223935602409637, + "language_loss": 0.83436418, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85732651, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.603900909423828 + }, + { + "auxiliary_loss_clip": 0.01128201, + "auxiliary_loss_mlp": 0.00748583, + "balance_loss_clip": 1.00203574, + "balance_loss_mlp": 1.00117934, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 1.8897662229375625, + "language_loss": 0.90147305, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92024088, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 2.6006758213043213 + }, + { + "auxiliary_loss_clip": 0.01159848, + "auxiliary_loss_mlp": 0.01150616, + "balance_loss_clip": 1.00218832, + "balance_loss_mlp": 1.00088954, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.0756625241338704, + "language_loss": 0.74366605, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76677072, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01113626, + "auxiliary_loss_mlp": 0.01151461, + "balance_loss_clip": 1.00209212, + "balance_loss_mlp": 1.00106692, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 1.9114263518314503, + "language_loss": 0.83195281, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85460365, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.6641271114349365 + }, + { + "auxiliary_loss_clip": 0.01111889, + "auxiliary_loss_mlp": 0.00748631, + "balance_loss_clip": 1.00182474, + "balance_loss_mlp": 1.00114274, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.7910111030671694, + "language_loss": 0.75957453, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77817976, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.7189855575561523 + }, + { + "auxiliary_loss_clip": 0.01161516, + "auxiliary_loss_mlp": 0.01151036, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.0008328, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8645663203121712, + "language_loss": 0.81101817, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.8341437, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.617335081100464 + }, + { + "auxiliary_loss_clip": 0.01176918, + "auxiliary_loss_mlp": 0.0115069, + "balance_loss_clip": 1.00225186, + "balance_loss_mlp": 1.0007726, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 3.046722917996377, + "language_loss": 0.72795665, + "learning_rate": 3.648507856144961e-06, + "loss": 0.7512328, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 2.515505790710449 + }, + { + "auxiliary_loss_clip": 0.01144255, + "auxiliary_loss_mlp": 0.01151063, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00076365, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 1.7158226068117124, + "language_loss": 0.8405112, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86346436, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.568528652191162 + }, + { + "auxiliary_loss_clip": 0.01127058, + "auxiliary_loss_mlp": 0.01151522, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00093627, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 1.733625232092067, + "language_loss": 0.68871439, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71150023, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.654597759246826 + }, + { + "auxiliary_loss_clip": 0.01128279, + "auxiliary_loss_mlp": 0.01150881, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00086761, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.7157879233428486, + "language_loss": 0.8411634, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86395502, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.629910469055176 + }, + { + "auxiliary_loss_clip": 0.01128057, + "auxiliary_loss_mlp": 0.01151087, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00078773, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.5079779591029436, + "language_loss": 0.75721431, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.78000581, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.6519370079040527 + }, + { + "auxiliary_loss_clip": 0.01160167, + "auxiliary_loss_mlp": 0.01150263, + "balance_loss_clip": 1.00222707, + "balance_loss_mlp": 1.00072646, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.6469103538379333, + "language_loss": 0.80700016, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.83010447, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.5756185054779053 + }, + { + "auxiliary_loss_clip": 0.01126933, + "auxiliary_loss_mlp": 0.01151026, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.00082278, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.09925818178435, + "language_loss": 0.78884757, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81162715, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.612048864364624 + }, + { + "auxiliary_loss_clip": 0.01094697, + "auxiliary_loss_mlp": 0.01150649, + "balance_loss_clip": 1.00201476, + "balance_loss_mlp": 1.00082695, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.6908839488853653, + "language_loss": 0.83125407, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85370749, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.7159323692321777 + }, + { + "auxiliary_loss_clip": 0.01144781, + "auxiliary_loss_mlp": 0.00748696, + "balance_loss_clip": 1.002092, + "balance_loss_mlp": 1.00102401, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.7229315581622722, + "language_loss": 0.80758715, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82652193, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.576092481613159 + }, + { + "auxiliary_loss_clip": 0.01145114, + "auxiliary_loss_mlp": 0.01151231, + "balance_loss_clip": 1.00215495, + "balance_loss_mlp": 1.00083685, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 2.7100455422610077, + "language_loss": 0.81778646, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.84074998, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 4.037790536880493 + }, + { + "auxiliary_loss_clip": 0.01128093, + "auxiliary_loss_mlp": 0.00748608, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.0009352, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 1.8167319324533562, + "language_loss": 0.76379478, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78256178, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.6054227352142334 + }, + { + "auxiliary_loss_clip": 0.01129689, + "auxiliary_loss_mlp": 0.01151155, + "balance_loss_clip": 1.00227928, + "balance_loss_mlp": 1.00095177, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.8735735532003222, + "language_loss": 0.80348992, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82629836, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 2.6332621574401855 + }, + { + "auxiliary_loss_clip": 0.01176846, + "auxiliary_loss_mlp": 0.01150626, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00109005, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 2.044949222923698, + "language_loss": 0.83253217, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85580683, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.4979336261749268 + }, + { + "auxiliary_loss_clip": 0.01176941, + "auxiliary_loss_mlp": 0.01151045, + "balance_loss_clip": 1.00228775, + "balance_loss_mlp": 1.00093675, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.7448783772089862, + "language_loss": 0.74664527, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76992506, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.497657299041748 + }, + { + "auxiliary_loss_clip": 0.0114578, + "auxiliary_loss_mlp": 0.01150747, + "balance_loss_clip": 1.00225616, + "balance_loss_mlp": 1.00073397, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.696781126883652, + "language_loss": 0.7444067, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76737195, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.5692293643951416 + }, + { + "auxiliary_loss_clip": 0.01160075, + "auxiliary_loss_mlp": 0.01150646, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00091946, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.7739964002696278, + "language_loss": 0.80100572, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82411289, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.5709617137908936 + }, + { + "auxiliary_loss_clip": 0.01175631, + "auxiliary_loss_mlp": 0.01141482, + "balance_loss_clip": 1.00269079, + "balance_loss_mlp": 1.00186443, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.7037729298935829, + "language_loss": 0.58363837, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.6068095, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 4.580142974853516 + }, + { + "auxiliary_loss_clip": 0.01177033, + "auxiliary_loss_mlp": 0.01151045, + "balance_loss_clip": 1.00230634, + "balance_loss_mlp": 1.00074601, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 8.298703074739002, + "language_loss": 0.73401868, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75729948, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 2.5090816020965576 + }, + { + "auxiliary_loss_clip": 0.01143577, + "auxiliary_loss_mlp": 0.01150764, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00075138, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 1.9594946029168525, + "language_loss": 0.76860297, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.79154634, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 2.5315566062927246 + }, + { + "auxiliary_loss_clip": 0.01094965, + "auxiliary_loss_mlp": 0.01151092, + "balance_loss_clip": 1.00166059, + "balance_loss_mlp": 1.00088799, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.7909147784887223, + "language_loss": 0.73971182, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76217234, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 5.4560840129852295 + }, + { + "auxiliary_loss_clip": 0.01148752, + "auxiliary_loss_mlp": 0.01150586, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00095403, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 2.4140092682553376, + "language_loss": 0.88719332, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91018665, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.545257806777954 + }, + { + "auxiliary_loss_clip": 0.01176902, + "auxiliary_loss_mlp": 0.01150696, + "balance_loss_clip": 1.00228357, + "balance_loss_mlp": 1.00077796, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 1.6797292995827053, + "language_loss": 0.77647603, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.799752, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.4887588024139404 + }, + { + "auxiliary_loss_clip": 0.01096672, + "auxiliary_loss_mlp": 0.01150344, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.0009985, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 1.8217664492314811, + "language_loss": 0.63496697, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65743715, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.6687915325164795 + }, + { + "auxiliary_loss_clip": 0.01113495, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_clip": 1.00195706, + "balance_loss_mlp": 1.00074267, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.7133981581824003, + "language_loss": 0.75777304, + "learning_rate": 3.643419353014776e-06, + "loss": 0.78041363, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.6296584606170654 + }, + { + "auxiliary_loss_clip": 0.01128155, + "auxiliary_loss_mlp": 0.01151184, + "balance_loss_clip": 1.00207698, + "balance_loss_mlp": 1.0009805, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 1.8841960409490843, + "language_loss": 0.71419644, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73698986, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 2.5795154571533203 + }, + { + "auxiliary_loss_clip": 0.01160081, + "auxiliary_loss_mlp": 0.01150674, + "balance_loss_clip": 1.00208461, + "balance_loss_mlp": 1.00113761, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.6828722227309165, + "language_loss": 0.73736024, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.76046777, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.524080753326416 + }, + { + "auxiliary_loss_clip": 0.01161561, + "auxiliary_loss_mlp": 0.01151284, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.00079393, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 3.5394817521776933, + "language_loss": 0.9032566, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92638499, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.5366060733795166 + }, + { + "auxiliary_loss_clip": 0.01112722, + "auxiliary_loss_mlp": 0.01150885, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00068116, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 3.1172987131391565, + "language_loss": 0.81366086, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83629692, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.5930802822113037 + }, + { + "auxiliary_loss_clip": 0.01143303, + "auxiliary_loss_mlp": 0.01150456, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00092006, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.9770488044164083, + "language_loss": 0.75985706, + "learning_rate": 3.642308790849329e-06, + "loss": 0.78279459, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.5994436740875244 + }, + { + "auxiliary_loss_clip": 0.01161476, + "auxiliary_loss_mlp": 0.0115134, + "balance_loss_clip": 1.00219405, + "balance_loss_mlp": 1.00113583, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 2.2644871420771726, + "language_loss": 0.69447565, + "learning_rate": 3.642086491552996e-06, + "loss": 0.7176038, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.4996283054351807 + }, + { + "auxiliary_loss_clip": 0.01160048, + "auxiliary_loss_mlp": 0.01150853, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00103116, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 2.604816953480036, + "language_loss": 0.78335756, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80646658, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.5456695556640625 + }, + { + "auxiliary_loss_clip": 0.01176795, + "auxiliary_loss_mlp": 0.0115034, + "balance_loss_clip": 1.00222361, + "balance_loss_mlp": 1.00099468, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.2877611266855142, + "language_loss": 0.79749107, + "learning_rate": 3.641641706164509e-06, + "loss": 0.8207624, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.484778642654419 + }, + { + "auxiliary_loss_clip": 0.01161214, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.00088549, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 2.099599551961746, + "language_loss": 0.87684608, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89996147, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.552656412124634 + }, + { + "auxiliary_loss_clip": 0.0116159, + "auxiliary_loss_mlp": 0.01151002, + "balance_loss_clip": 1.00225854, + "balance_loss_mlp": 1.00079846, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 2.1682002341911213, + "language_loss": 0.76981044, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79293638, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.5035667419433594 + }, + { + "auxiliary_loss_clip": 0.01131837, + "auxiliary_loss_mlp": 0.01150852, + "balance_loss_clip": 1.00212371, + "balance_loss_mlp": 1.00083923, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 1.8175625610205088, + "language_loss": 0.84848273, + "learning_rate": 3.640974061218741e-06, + "loss": 0.87130964, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.5760629177093506 + }, + { + "auxiliary_loss_clip": 0.01161579, + "auxiliary_loss_mlp": 0.01151593, + "balance_loss_clip": 1.00236511, + "balance_loss_mlp": 1.00148451, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.7854078224034278, + "language_loss": 0.76896095, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79209268, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 2.534271717071533 + }, + { + "auxiliary_loss_clip": 0.01160526, + "auxiliary_loss_mlp": 0.01140463, + "balance_loss_clip": 1.00269866, + "balance_loss_mlp": 1.0008446, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.818812499025305, + "language_loss": 0.60695928, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62996918, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.204087734222412 + }, + { + "auxiliary_loss_clip": 0.01144598, + "auxiliary_loss_mlp": 0.00748506, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00076687, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.822261020786287, + "language_loss": 0.90403241, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92296338, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.6237289905548096 + }, + { + "auxiliary_loss_clip": 0.01118121, + "auxiliary_loss_mlp": 0.01150998, + "balance_loss_clip": 1.0022459, + "balance_loss_mlp": 1.00069928, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 2.3573915759229904, + "language_loss": 0.73453349, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75722468, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.6990668773651123 + }, + { + "auxiliary_loss_clip": 0.01176737, + "auxiliary_loss_mlp": 0.01150359, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00082302, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 2.0002326961596184, + "language_loss": 0.77169871, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79496968, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 2.524373769760132 + }, + { + "auxiliary_loss_clip": 0.01160191, + "auxiliary_loss_mlp": 0.01150772, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00095022, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.7362806807147133, + "language_loss": 0.7117725, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73488212, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 2.618074417114258 + }, + { + "auxiliary_loss_clip": 0.01096817, + "auxiliary_loss_mlp": 0.01149757, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00079358, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 3.04781459179114, + "language_loss": 0.76559156, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78805733, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.713192939758301 + }, + { + "auxiliary_loss_clip": 0.01176954, + "auxiliary_loss_mlp": 0.01150669, + "balance_loss_clip": 1.00230169, + "balance_loss_mlp": 1.00103736, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.13105115631614, + "language_loss": 0.75304234, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77631861, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.4799859523773193 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01149788, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00091982, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 2.232773489445708, + "language_loss": 0.83667397, + "learning_rate": 3.638967767095249e-06, + "loss": 0.85993844, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.468926191329956 + }, + { + "auxiliary_loss_clip": 0.01133179, + "auxiliary_loss_mlp": 0.01150818, + "balance_loss_clip": 1.00220108, + "balance_loss_mlp": 1.0009954, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.9334649945645992, + "language_loss": 0.81538236, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83822232, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.6063945293426514 + }, + { + "auxiliary_loss_clip": 0.01159563, + "auxiliary_loss_mlp": 0.01150453, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00082135, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.0774654654188933, + "language_loss": 0.74807459, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77117479, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.4981653690338135 + }, + { + "auxiliary_loss_clip": 0.01144727, + "auxiliary_loss_mlp": 0.01150079, + "balance_loss_clip": 1.00206673, + "balance_loss_mlp": 1.00092411, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 7.1536803660399935, + "language_loss": 0.88440382, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90735185, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.561868190765381 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.00748486, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00067842, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.106529940278667, + "language_loss": 0.75557601, + "learning_rate": 3.638074464556311e-06, + "loss": 0.77434075, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 4.08984112739563 + }, + { + "auxiliary_loss_clip": 0.01143709, + "auxiliary_loss_mlp": 0.01151016, + "balance_loss_clip": 1.00200832, + "balance_loss_mlp": 1.00081241, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 3.3155416299573273, + "language_loss": 0.89514387, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91809106, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 2.5010294914245605 + }, + { + "auxiliary_loss_clip": 0.01160022, + "auxiliary_loss_mlp": 0.01150183, + "balance_loss_clip": 1.00202477, + "balance_loss_mlp": 1.0010283, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.605195686131748, + "language_loss": 0.89924419, + "learning_rate": 3.637627440557275e-06, + "loss": 0.92234623, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.4672842025756836 + }, + { + "auxiliary_loss_clip": 0.01144529, + "auxiliary_loss_mlp": 0.00748542, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.00075698, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 2.8913004805204108, + "language_loss": 0.79612726, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81505799, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.604017972946167 + }, + { + "auxiliary_loss_clip": 0.01161397, + "auxiliary_loss_mlp": 0.0115102, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.00110281, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 1.9928398349621177, + "language_loss": 0.71992934, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74305356, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.4939098358154297 + }, + { + "auxiliary_loss_clip": 0.01144736, + "auxiliary_loss_mlp": 0.01150751, + "balance_loss_clip": 1.00226486, + "balance_loss_mlp": 1.0009284, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 1.8985859413893418, + "language_loss": 0.80922145, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83217627, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.5594964027404785 + }, + { + "auxiliary_loss_clip": 0.01161464, + "auxiliary_loss_mlp": 0.01150479, + "balance_loss_clip": 1.00225592, + "balance_loss_mlp": 1.00084722, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 1.7958384371691591, + "language_loss": 0.7174601, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.74057955, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 3.921323299407959 + }, + { + "auxiliary_loss_clip": 0.0117681, + "auxiliary_loss_mlp": 0.01150704, + "balance_loss_clip": 1.00223958, + "balance_loss_mlp": 1.00097728, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 1.9534518600706658, + "language_loss": 0.68060869, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70388383, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.73203444480896 + }, + { + "auxiliary_loss_clip": 0.01176804, + "auxiliary_loss_mlp": 0.01151256, + "balance_loss_clip": 1.0021559, + "balance_loss_mlp": 1.00095665, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.2569464959212744, + "language_loss": 0.78121251, + "learning_rate": 3.636284878455669e-06, + "loss": 0.80449319, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.492050886154175 + }, + { + "auxiliary_loss_clip": 0.01159991, + "auxiliary_loss_mlp": 0.01150918, + "balance_loss_clip": 1.00215364, + "balance_loss_mlp": 1.00128651, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.6131105287998369, + "language_loss": 0.826029, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84913814, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 4.0193469524383545 + }, + { + "auxiliary_loss_clip": 0.01161248, + "auxiliary_loss_mlp": 0.01150532, + "balance_loss_clip": 1.00225472, + "balance_loss_mlp": 1.00090075, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 2.6634962411363023, + "language_loss": 0.83034712, + "learning_rate": 3.635836861279901e-06, + "loss": 0.8534649, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 3.9106099605560303 + }, + { + "auxiliary_loss_clip": 0.01176542, + "auxiliary_loss_mlp": 0.01150514, + "balance_loss_clip": 1.00202, + "balance_loss_mlp": 1.00107312, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.6667512835323857, + "language_loss": 0.72408259, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74735314, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.543945074081421 + }, + { + "auxiliary_loss_clip": 0.0112818, + "auxiliary_loss_mlp": 0.01150736, + "balance_loss_clip": 1.00177729, + "balance_loss_mlp": 1.00081837, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.7739725382597555, + "language_loss": 0.74111998, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76390916, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 2.5622568130493164 + }, + { + "auxiliary_loss_clip": 0.01159902, + "auxiliary_loss_mlp": 0.011502, + "balance_loss_clip": 1.0020889, + "balance_loss_mlp": 1.00104511, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 1.8981851673579033, + "language_loss": 0.86430728, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88740832, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.4926087856292725 + }, + { + "auxiliary_loss_clip": 0.01144117, + "auxiliary_loss_mlp": 0.01150444, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00100291, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.1029461763678703, + "language_loss": 0.83982754, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.86277318, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.5827722549438477 + }, + { + "auxiliary_loss_clip": 0.01161207, + "auxiliary_loss_mlp": 0.01150321, + "balance_loss_clip": 1.00207067, + "balance_loss_mlp": 1.00078487, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8447729792751786, + "language_loss": 0.74876404, + "learning_rate": 3.634715732945027e-06, + "loss": 0.77187932, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.504049777984619 + }, + { + "auxiliary_loss_clip": 0.01110817, + "auxiliary_loss_mlp": 0.01139747, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.00089192, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7371251050532491, + "language_loss": 0.5153749, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53788054, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.2049150466918945 + }, + { + "auxiliary_loss_clip": 0.01144926, + "auxiliary_loss_mlp": 0.01150945, + "balance_loss_clip": 1.00236261, + "balance_loss_mlp": 1.00093246, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.943025404324571, + "language_loss": 0.75656825, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77952695, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 2.602917194366455 + }, + { + "auxiliary_loss_clip": 0.0116572, + "auxiliary_loss_mlp": 0.01150957, + "balance_loss_clip": 1.00240469, + "balance_loss_mlp": 1.00094438, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.8673675502701808, + "language_loss": 0.72973979, + "learning_rate": 3.634042312013064e-06, + "loss": 0.75290656, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.512824773788452 + }, + { + "auxiliary_loss_clip": 0.0114536, + "auxiliary_loss_mlp": 0.01150073, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00091791, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.5744824468299565, + "language_loss": 0.80625594, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.82921034, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.57698655128479 + }, + { + "auxiliary_loss_clip": 0.01127981, + "auxiliary_loss_mlp": 0.00748561, + "balance_loss_clip": 1.00221419, + "balance_loss_mlp": 1.00104141, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.437871726435862, + "language_loss": 0.84462517, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86339056, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.579223871231079 + }, + { + "auxiliary_loss_clip": 0.01160513, + "auxiliary_loss_mlp": 0.01151056, + "balance_loss_clip": 1.00225306, + "balance_loss_mlp": 1.00085211, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.6378847772969076, + "language_loss": 0.8039993, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82711494, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.6116321086883545 + }, + { + "auxiliary_loss_clip": 0.01127543, + "auxiliary_loss_mlp": 0.01139297, + "balance_loss_clip": 1.00222516, + "balance_loss_mlp": 1.00044239, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7769275542935277, + "language_loss": 0.58164334, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.6043117, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.2794859409332275 + }, + { + "auxiliary_loss_clip": 0.01129186, + "auxiliary_loss_mlp": 0.0115049, + "balance_loss_clip": 1.00196421, + "balance_loss_mlp": 1.00085819, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.211631248667372, + "language_loss": 0.74917555, + "learning_rate": 3.632918704645772e-06, + "loss": 0.7719723, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.61710262298584 + }, + { + "auxiliary_loss_clip": 0.01160148, + "auxiliary_loss_mlp": 0.01150746, + "balance_loss_clip": 1.00213933, + "balance_loss_mlp": 1.00082815, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.6519777652853675, + "language_loss": 0.81152225, + "learning_rate": 3.632693797376893e-06, + "loss": 0.8346312, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.551546573638916 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01150466, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00093031, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.7593211704177854, + "language_loss": 0.73508793, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75804746, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 2.6175732612609863 + }, + { + "auxiliary_loss_clip": 0.01144107, + "auxiliary_loss_mlp": 0.0115034, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00118518, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6141761840591902, + "language_loss": 0.78432959, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80727398, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.6013100147247314 + }, + { + "auxiliary_loss_clip": 0.01161389, + "auxiliary_loss_mlp": 0.01151137, + "balance_loss_clip": 1.00242794, + "balance_loss_mlp": 1.00102901, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.692069206843141, + "language_loss": 0.80596721, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82909244, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.543118476867676 + }, + { + "auxiliary_loss_clip": 0.01142955, + "auxiliary_loss_mlp": 0.01150819, + "balance_loss_clip": 1.00200665, + "balance_loss_mlp": 1.0009017, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.7636399538933007, + "language_loss": 0.77230716, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.79524493, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.5231642723083496 + }, + { + "auxiliary_loss_clip": 0.01143803, + "auxiliary_loss_mlp": 0.01151509, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.00140047, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.5918886237189778, + "language_loss": 0.97771853, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00067163, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01160039, + "auxiliary_loss_mlp": 0.00748715, + "balance_loss_clip": 1.00207031, + "balance_loss_mlp": 1.00104725, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.6069062463341308, + "language_loss": 0.80429572, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82338321, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.7010040283203125 + }, + { + "auxiliary_loss_clip": 0.01160143, + "auxiliary_loss_mlp": 0.01151116, + "balance_loss_clip": 1.00218892, + "balance_loss_mlp": 1.00110292, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.614822712850057, + "language_loss": 0.7734645, + "learning_rate": 3.631117713439087e-06, + "loss": 0.7965771, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 2.51393461227417 + }, + { + "auxiliary_loss_clip": 0.01160536, + "auxiliary_loss_mlp": 0.0115038, + "balance_loss_clip": 1.00239611, + "balance_loss_mlp": 1.00093925, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.5907680302766698, + "language_loss": 0.71177119, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73488033, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.623826503753662 + }, + { + "auxiliary_loss_clip": 0.01176718, + "auxiliary_loss_mlp": 0.01149939, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.0007844, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 1.6540466416187196, + "language_loss": 0.85439491, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87766147, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.4933810234069824 + }, + { + "auxiliary_loss_clip": 0.01145433, + "auxiliary_loss_mlp": 0.0115059, + "balance_loss_clip": 1.00218749, + "balance_loss_mlp": 1.0009582, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.6906039924625154, + "language_loss": 0.77161205, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79457223, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.671924114227295 + }, + { + "auxiliary_loss_clip": 0.0114383, + "auxiliary_loss_mlp": 0.0115066, + "balance_loss_clip": 1.00226963, + "balance_loss_mlp": 1.00093305, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 1.847189318899409, + "language_loss": 0.81088501, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83382994, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.532127857208252 + }, + { + "auxiliary_loss_clip": 0.01160806, + "auxiliary_loss_mlp": 0.01150474, + "balance_loss_clip": 1.00218058, + "balance_loss_mlp": 1.00112891, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8378331777831194, + "language_loss": 0.73277169, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75588453, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 2.634481191635132 + }, + { + "auxiliary_loss_clip": 0.01129878, + "auxiliary_loss_mlp": 0.01150421, + "balance_loss_clip": 1.00224507, + "balance_loss_mlp": 1.0006938, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9872286745345045, + "language_loss": 0.76548898, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78829193, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.7120749950408936 + }, + { + "auxiliary_loss_clip": 0.01176739, + "auxiliary_loss_mlp": 0.01150278, + "balance_loss_clip": 1.00231338, + "balance_loss_mlp": 1.0010283, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.0207442469166756, + "language_loss": 0.74657416, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76984435, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 3.959557056427002 + }, + { + "auxiliary_loss_clip": 0.01176739, + "auxiliary_loss_mlp": 0.01150796, + "balance_loss_clip": 1.00224435, + "balance_loss_mlp": 1.0009743, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.8080930401450162, + "language_loss": 0.79955566, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82283098, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.5732200145721436 + }, + { + "auxiliary_loss_clip": 0.01143334, + "auxiliary_loss_mlp": 0.01150537, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.00109649, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 2.0766577429638233, + "language_loss": 0.75360429, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77654302, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.547255277633667 + }, + { + "auxiliary_loss_clip": 0.01127999, + "auxiliary_loss_mlp": 0.01150392, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00095081, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.9891809250878791, + "language_loss": 0.83292532, + "learning_rate": 3.628860908251712e-06, + "loss": 0.8557092, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.597195863723755 + }, + { + "auxiliary_loss_clip": 0.01112739, + "auxiliary_loss_mlp": 0.01149963, + "balance_loss_clip": 1.00213313, + "balance_loss_mlp": 1.00090396, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 2.027708674751371, + "language_loss": 0.88724434, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.90987134, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 2.664740800857544 + }, + { + "auxiliary_loss_clip": 0.01160934, + "auxiliary_loss_mlp": 0.01151573, + "balance_loss_clip": 1.00235081, + "balance_loss_mlp": 1.00117922, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.256746820086439, + "language_loss": 0.86714029, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.89026535, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.500927686691284 + }, + { + "auxiliary_loss_clip": 0.01126891, + "auxiliary_loss_mlp": 0.01150574, + "balance_loss_clip": 1.00192106, + "balance_loss_mlp": 1.00141931, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.062249528951966, + "language_loss": 0.81588185, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83865649, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.5888664722442627 + }, + { + "auxiliary_loss_clip": 0.01176574, + "auxiliary_loss_mlp": 0.00748738, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.00126302, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.666082589221942, + "language_loss": 0.79612654, + "learning_rate": 3.62795645623335e-06, + "loss": 0.81537962, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 3.8814737796783447 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01150576, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00094414, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.884574792876716, + "language_loss": 0.77405572, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79702115, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.5869524478912354 + }, + { + "auxiliary_loss_clip": 0.01144602, + "auxiliary_loss_mlp": 0.01150496, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00096023, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.070026371100547, + "language_loss": 0.72636008, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74931109, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 4.009148836135864 + }, + { + "auxiliary_loss_clip": 0.01096008, + "auxiliary_loss_mlp": 0.01150216, + "balance_loss_clip": 1.00182521, + "balance_loss_mlp": 1.00096583, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.9668588441331099, + "language_loss": 0.80309796, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82556021, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 4.053227186203003 + }, + { + "auxiliary_loss_clip": 0.01176576, + "auxiliary_loss_mlp": 0.01150343, + "balance_loss_clip": 1.0022136, + "balance_loss_mlp": 1.00128341, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.4162505833257646, + "language_loss": 0.87341803, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89668727, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.546412467956543 + }, + { + "auxiliary_loss_clip": 0.01159895, + "auxiliary_loss_mlp": 0.01149901, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00084186, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 2.0071211335533206, + "language_loss": 0.77494001, + "learning_rate": 3.626824502298707e-06, + "loss": 0.79803789, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 2.523662805557251 + }, + { + "auxiliary_loss_clip": 0.01145654, + "auxiliary_loss_mlp": 0.01150992, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00126553, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8409736973008846, + "language_loss": 0.84855676, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87152314, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 2.681948661804199 + }, + { + "auxiliary_loss_clip": 0.01112688, + "auxiliary_loss_mlp": 0.01150634, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.0009073, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.3559158123517614, + "language_loss": 0.81392825, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83656156, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.6868538856506348 + }, + { + "auxiliary_loss_clip": 0.0114298, + "auxiliary_loss_mlp": 0.0115051, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00106895, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.9993581430252627, + "language_loss": 0.70361805, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72655296, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.602917432785034 + }, + { + "auxiliary_loss_clip": 0.01160694, + "auxiliary_loss_mlp": 0.00749008, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00158024, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.8997227969446835, + "language_loss": 0.72266471, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74176174, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.5731046199798584 + }, + { + "auxiliary_loss_clip": 0.01160101, + "auxiliary_loss_mlp": 0.01150423, + "balance_loss_clip": 1.00229132, + "balance_loss_mlp": 1.00107741, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.158697457050111, + "language_loss": 0.71816111, + "learning_rate": 3.625691006130477e-06, + "loss": 0.74126637, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.5903267860412598 + }, + { + "auxiliary_loss_clip": 0.01165595, + "auxiliary_loss_mlp": 0.01150399, + "balance_loss_clip": 1.00234723, + "balance_loss_mlp": 1.00105405, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.66855736161831, + "language_loss": 0.87433821, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89749813, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 2.536911725997925 + }, + { + "auxiliary_loss_clip": 0.01160563, + "auxiliary_loss_mlp": 0.01149382, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 1.00079918, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 2.293229007955109, + "language_loss": 0.85907012, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.8821696, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.4872992038726807 + }, + { + "auxiliary_loss_clip": 0.01129331, + "auxiliary_loss_mlp": 0.01150719, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00089669, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9868422747833285, + "language_loss": 0.69339144, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71619189, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.5942580699920654 + }, + { + "auxiliary_loss_clip": 0.01144607, + "auxiliary_loss_mlp": 0.01149421, + "balance_loss_clip": 1.00201106, + "balance_loss_mlp": 1.00093389, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4539648001224572, + "language_loss": 0.71830153, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.74124181, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.622394323348999 + }, + { + "auxiliary_loss_clip": 0.01159779, + "auxiliary_loss_mlp": 0.01149916, + "balance_loss_clip": 1.00205541, + "balance_loss_mlp": 1.00076139, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 23.566824959223627, + "language_loss": 0.87778413, + "learning_rate": 3.624555968803217e-06, + "loss": 0.90088105, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.553718090057373 + }, + { + "auxiliary_loss_clip": 0.01144598, + "auxiliary_loss_mlp": 0.01149186, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00098503, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.4866874974512674, + "language_loss": 0.65898782, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68192565, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.7293365001678467 + }, + { + "auxiliary_loss_clip": 0.01159984, + "auxiliary_loss_mlp": 0.01150607, + "balance_loss_clip": 1.00209212, + "balance_loss_mlp": 1.0010705, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9454061697572176, + "language_loss": 0.82666373, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84976959, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.636019706726074 + }, + { + "auxiliary_loss_clip": 0.01145922, + "auxiliary_loss_mlp": 0.01150028, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00106382, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6644206691341104, + "language_loss": 0.79691327, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81987274, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.5719618797302246 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.0115033, + "balance_loss_clip": 1.00219607, + "balance_loss_mlp": 1.00079405, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 1.9294019680794092, + "language_loss": 0.71962297, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74230433, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 2.8480029106140137 + }, + { + "auxiliary_loss_clip": 0.0116538, + "auxiliary_loss_mlp": 0.01149925, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.0008657, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 2.8767224449818745, + "language_loss": 0.79796183, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.8211149, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.548921823501587 + }, + { + "auxiliary_loss_clip": 0.01159632, + "auxiliary_loss_mlp": 0.01149556, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00078285, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 2.0264369541367815, + "language_loss": 0.78826863, + "learning_rate": 3.623191891195723e-06, + "loss": 0.81136048, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.5106241703033447 + }, + { + "auxiliary_loss_clip": 0.01160198, + "auxiliary_loss_mlp": 0.01149809, + "balance_loss_clip": 1.00206006, + "balance_loss_mlp": 1.00074983, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 4.244350500470733, + "language_loss": 0.74529433, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76839441, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.5160062313079834 + }, + { + "auxiliary_loss_clip": 0.0112807, + "auxiliary_loss_mlp": 0.01149833, + "balance_loss_clip": 1.00224936, + "balance_loss_mlp": 1.0011549, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 2.060552485301097, + "language_loss": 0.64865816, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.67143714, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.859731435775757 + }, + { + "auxiliary_loss_clip": 0.01124465, + "auxiliary_loss_mlp": 0.01141005, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.00138748, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.282544518666437, + "language_loss": 0.65145671, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67411143, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 3.102843999862671 + }, + { + "auxiliary_loss_clip": 0.01144679, + "auxiliary_loss_mlp": 0.01150054, + "balance_loss_clip": 1.00213134, + "balance_loss_mlp": 1.00080454, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 1.7204878283567473, + "language_loss": 0.80446249, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82740986, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.6745922565460205 + }, + { + "auxiliary_loss_clip": 0.01176682, + "auxiliary_loss_mlp": 0.01149629, + "balance_loss_clip": 1.00238347, + "balance_loss_mlp": 1.00066543, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.8173379089533075, + "language_loss": 0.7848525, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80811566, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.53727388381958 + }, + { + "auxiliary_loss_clip": 0.01145131, + "auxiliary_loss_mlp": 0.01150144, + "balance_loss_clip": 1.00206053, + "balance_loss_mlp": 1.00089407, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 6.69572293024898, + "language_loss": 0.80538762, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82834035, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 2.6584606170654297 + }, + { + "auxiliary_loss_clip": 0.01159932, + "auxiliary_loss_mlp": 0.00748805, + "balance_loss_clip": 1.00213671, + "balance_loss_mlp": 1.00120234, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 1.9804776826434742, + "language_loss": 0.69005668, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70914406, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.5750718116760254 + }, + { + "auxiliary_loss_clip": 0.01129864, + "auxiliary_loss_mlp": 0.01149966, + "balance_loss_clip": 1.00205398, + "balance_loss_mlp": 1.00090647, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 1.9895574243907732, + "language_loss": 0.90732682, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.93012512, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.588324785232544 + }, + { + "auxiliary_loss_clip": 0.01143491, + "auxiliary_loss_mlp": 0.01150643, + "balance_loss_clip": 1.00219107, + "balance_loss_mlp": 1.00091577, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.508268386947262, + "language_loss": 0.8973825, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.92032379, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.5087740421295166 + }, + { + "auxiliary_loss_clip": 0.01176622, + "auxiliary_loss_mlp": 0.01149396, + "balance_loss_clip": 1.00232661, + "balance_loss_mlp": 1.00100398, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 2.7397980296614413, + "language_loss": 0.75148797, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77474809, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 3.861734390258789 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01149665, + "balance_loss_clip": 1.00234628, + "balance_loss_mlp": 1.00089169, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.9997014379727887, + "language_loss": 0.62235123, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64486444, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 2.863780975341797 + }, + { + "auxiliary_loss_clip": 0.01128682, + "auxiliary_loss_mlp": 0.01149448, + "balance_loss_clip": 1.00208724, + "balance_loss_mlp": 1.0006752, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.8308862645064987, + "language_loss": 0.79142392, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81420517, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.6092402935028076 + }, + { + "auxiliary_loss_clip": 0.01126778, + "auxiliary_loss_mlp": 0.0115091, + "balance_loss_clip": 1.00215018, + "balance_loss_mlp": 1.00146937, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.741143936779082, + "language_loss": 0.76567698, + "learning_rate": 3.620228790579645e-06, + "loss": 0.78845394, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.5741779804229736 + }, + { + "auxiliary_loss_clip": 0.01144591, + "auxiliary_loss_mlp": 0.01149897, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00083733, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.2593098640848375, + "language_loss": 0.78515899, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.8081038, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 2.53113055229187 + }, + { + "auxiliary_loss_clip": 0.01095737, + "auxiliary_loss_mlp": 0.01150646, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00091958, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 3.4901291512674586, + "language_loss": 0.68128443, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70374823, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.696211576461792 + }, + { + "auxiliary_loss_clip": 0.01145268, + "auxiliary_loss_mlp": 0.01149841, + "balance_loss_clip": 1.00210845, + "balance_loss_mlp": 1.00087678, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.5301851998116622, + "language_loss": 0.80437058, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82732171, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.641444206237793 + }, + { + "auxiliary_loss_clip": 0.01143553, + "auxiliary_loss_mlp": 0.01151131, + "balance_loss_clip": 1.0021497, + "balance_loss_mlp": 1.00121391, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 1.7426248770916928, + "language_loss": 0.8664062, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88935304, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 4.052062511444092 + }, + { + "auxiliary_loss_clip": 0.01144438, + "auxiliary_loss_mlp": 0.01149875, + "balance_loss_clip": 1.00231886, + "balance_loss_mlp": 1.00091136, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.8120335225391049, + "language_loss": 0.7496534, + "learning_rate": 3.619086370692945e-06, + "loss": 0.7725966, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 2.6114962100982666 + }, + { + "auxiliary_loss_clip": 0.01176699, + "auxiliary_loss_mlp": 0.01149724, + "balance_loss_clip": 1.00227356, + "balance_loss_mlp": 1.00085509, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.333424731989107, + "language_loss": 0.79252493, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81578922, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 5.318214178085327 + }, + { + "auxiliary_loss_clip": 0.01126535, + "auxiliary_loss_mlp": 0.01149704, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.0008359, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2807478430592845, + "language_loss": 0.82866156, + "learning_rate": 3.618628972906178e-06, + "loss": 0.85142386, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.5678932666778564 + }, + { + "auxiliary_loss_clip": 0.01176814, + "auxiliary_loss_mlp": 0.01149811, + "balance_loss_clip": 1.00235593, + "balance_loss_mlp": 1.00103807, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 1.9405998219399347, + "language_loss": 0.8477515, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.87101769, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.567349910736084 + }, + { + "auxiliary_loss_clip": 0.01150033, + "auxiliary_loss_mlp": 0.011495, + "balance_loss_clip": 1.00240052, + "balance_loss_mlp": 1.00101292, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 1.6873662988933358, + "language_loss": 0.79001951, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81301481, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.6330957412719727 + }, + { + "auxiliary_loss_clip": 0.01092886, + "auxiliary_loss_mlp": 0.01149707, + "balance_loss_clip": 1.00170994, + "balance_loss_mlp": 1.00102878, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 2.0438542014798875, + "language_loss": 0.7741074, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79653335, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 2.6881277561187744 + }, + { + "auxiliary_loss_clip": 0.01161385, + "auxiliary_loss_mlp": 0.01150698, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00097132, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.512990692018565, + "language_loss": 0.72050631, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74362719, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.484285354614258 + }, + { + "auxiliary_loss_clip": 0.01176452, + "auxiliary_loss_mlp": 0.01149811, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.00094259, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.0559861410082956, + "language_loss": 0.86652958, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.8897922, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.507657289505005 + }, + { + "auxiliary_loss_clip": 0.01144536, + "auxiliary_loss_mlp": 0.0115001, + "balance_loss_clip": 1.00217986, + "balance_loss_mlp": 1.00085545, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.0308004019008217, + "language_loss": 0.80227566, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82522112, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.580444574356079 + }, + { + "auxiliary_loss_clip": 0.01143164, + "auxiliary_loss_mlp": 0.01149163, + "balance_loss_clip": 1.00202215, + "balance_loss_mlp": 1.00096238, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.7215134812805863, + "language_loss": 0.86819863, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.89112186, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.668698310852051 + }, + { + "auxiliary_loss_clip": 0.0114846, + "auxiliary_loss_mlp": 0.00748619, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00106275, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.7055151682851195, + "language_loss": 0.73083788, + "learning_rate": 3.616796927310559e-06, + "loss": 0.74980867, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.5565412044525146 + }, + { + "auxiliary_loss_clip": 0.01128425, + "auxiliary_loss_mlp": 0.01149763, + "balance_loss_clip": 1.00217867, + "balance_loss_mlp": 1.00079894, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 1.7328343701613635, + "language_loss": 0.7546283, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77741027, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 2.5919809341430664 + }, + { + "auxiliary_loss_clip": 0.01176589, + "auxiliary_loss_mlp": 0.0114955, + "balance_loss_clip": 1.0022459, + "balance_loss_mlp": 1.00115776, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.978267276351153, + "language_loss": 0.88539529, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90865672, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.5132546424865723 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.01149582, + "balance_loss_clip": 1.00204062, + "balance_loss_mlp": 1.00090384, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6360222054005384, + "language_loss": 0.84744, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.87010449, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.632000207901001 + }, + { + "auxiliary_loss_clip": 0.01145708, + "auxiliary_loss_mlp": 0.01149218, + "balance_loss_clip": 1.00201595, + "balance_loss_mlp": 1.00111246, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6657554356274962, + "language_loss": 0.76704127, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78999054, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.6083285808563232 + }, + { + "auxiliary_loss_clip": 0.01159931, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_clip": 1.00232685, + "balance_loss_mlp": 1.00085425, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.7756963601196718, + "language_loss": 0.84682572, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.8699156, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.577829360961914 + }, + { + "auxiliary_loss_clip": 0.01126567, + "auxiliary_loss_mlp": 0.01150439, + "balance_loss_clip": 1.00224209, + "balance_loss_mlp": 1.00090241, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.5814622918127577, + "language_loss": 0.86297101, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88574111, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.588090419769287 + }, + { + "auxiliary_loss_clip": 0.01176679, + "auxiliary_loss_mlp": 0.01149983, + "balance_loss_clip": 1.00227094, + "balance_loss_mlp": 1.0009234, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 2.051230650475862, + "language_loss": 0.79013193, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81339854, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.5599589347839355 + }, + { + "auxiliary_loss_clip": 0.01144574, + "auxiliary_loss_mlp": 0.01149555, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.00106847, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.6433843110079214, + "language_loss": 0.7661587, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78910005, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 2.579407215118408 + }, + { + "auxiliary_loss_clip": 0.01129347, + "auxiliary_loss_mlp": 0.01149369, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00097728, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 2.025494831362588, + "language_loss": 0.74354887, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76633608, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.697999954223633 + }, + { + "auxiliary_loss_clip": 0.01176447, + "auxiliary_loss_mlp": 0.01149005, + "balance_loss_clip": 1.00212848, + "balance_loss_mlp": 1.00089908, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.143903170148088, + "language_loss": 0.75529027, + "learning_rate": 3.614501353019939e-06, + "loss": 0.77854478, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.489194393157959 + }, + { + "auxiliary_loss_clip": 0.01149181, + "auxiliary_loss_mlp": 0.01149712, + "balance_loss_clip": 1.00261486, + "balance_loss_mlp": 1.00074792, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.8007593431645976, + "language_loss": 0.87640536, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89939427, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.5487253665924072 + }, + { + "auxiliary_loss_clip": 0.0111796, + "auxiliary_loss_mlp": 0.01149274, + "balance_loss_clip": 1.00232804, + "balance_loss_mlp": 1.00097787, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 1.9402638700760921, + "language_loss": 0.81669563, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83936799, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.641145944595337 + }, + { + "auxiliary_loss_clip": 0.01160705, + "auxiliary_loss_mlp": 0.01149477, + "balance_loss_clip": 1.00217485, + "balance_loss_mlp": 1.00070429, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.250578780625317, + "language_loss": 0.63446581, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65756762, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.491157293319702 + }, + { + "auxiliary_loss_clip": 0.01159758, + "auxiliary_loss_mlp": 0.01149487, + "balance_loss_clip": 1.00206637, + "balance_loss_mlp": 1.00090492, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 2.460242959590069, + "language_loss": 0.75809592, + "learning_rate": 3.613581408598489e-06, + "loss": 0.78118837, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.4978389739990234 + }, + { + "auxiliary_loss_clip": 0.01144817, + "auxiliary_loss_mlp": 0.011498, + "balance_loss_clip": 1.00239754, + "balance_loss_mlp": 1.00102687, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.8379968599061678, + "language_loss": 0.81061256, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83355868, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.5595836639404297 + }, + { + "auxiliary_loss_clip": 0.01159842, + "auxiliary_loss_mlp": 0.01149519, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00093651, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.723146704457638, + "language_loss": 0.86297661, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88607019, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.540489435195923 + }, + { + "auxiliary_loss_clip": 0.01159704, + "auxiliary_loss_mlp": 0.00748601, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00118351, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.6869005554562129, + "language_loss": 0.76785588, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78693891, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.5698928833007812 + }, + { + "auxiliary_loss_clip": 0.01176449, + "auxiliary_loss_mlp": 0.01149566, + "balance_loss_clip": 1.00217438, + "balance_loss_mlp": 1.00107932, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.6555822178588642, + "language_loss": 0.79815292, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.8214131, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.4945011138916016 + }, + { + "auxiliary_loss_clip": 0.01142982, + "auxiliary_loss_mlp": 0.01148461, + "balance_loss_clip": 1.00194275, + "balance_loss_mlp": 1.00083256, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6404987997841376, + "language_loss": 0.79747999, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.8203944, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 4.029436111450195 + }, + { + "auxiliary_loss_clip": 0.01112349, + "auxiliary_loss_mlp": 0.01150288, + "balance_loss_clip": 1.00205076, + "balance_loss_mlp": 1.00094235, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 1.7462686019624838, + "language_loss": 0.82031029, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84293669, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.6750919818878174 + }, + { + "auxiliary_loss_clip": 0.01144557, + "auxiliary_loss_mlp": 0.01149326, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00083852, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.9423025588322722, + "language_loss": 0.83785951, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86079836, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.5456364154815674 + }, + { + "auxiliary_loss_clip": 0.01176367, + "auxiliary_loss_mlp": 0.01148753, + "balance_loss_clip": 1.00210035, + "balance_loss_mlp": 1.0008378, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.8578394025718743, + "language_loss": 0.7867471, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80999827, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.446646213531494 + }, + { + "auxiliary_loss_clip": 0.01142978, + "auxiliary_loss_mlp": 0.01149224, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00083232, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.9312070436738826, + "language_loss": 0.78478217, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80770415, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.6847705841064453 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01149685, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00119829, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.8897410369082588, + "language_loss": 0.70394123, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72687244, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.5651450157165527 + }, + { + "auxiliary_loss_clip": 0.0112755, + "auxiliary_loss_mlp": 0.01149248, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00104737, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.428216791572693, + "language_loss": 0.77203912, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79480708, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.6207163333892822 + }, + { + "auxiliary_loss_clip": 0.01145446, + "auxiliary_loss_mlp": 0.01149499, + "balance_loss_clip": 1.00245869, + "balance_loss_mlp": 1.00082135, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.9385473890180656, + "language_loss": 0.82491165, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84786111, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.5730173587799072 + }, + { + "auxiliary_loss_clip": 0.01159876, + "auxiliary_loss_mlp": 0.01149093, + "balance_loss_clip": 1.00220895, + "balance_loss_mlp": 1.00079632, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.7235968172363387, + "language_loss": 0.73233402, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75542372, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 3.9350550174713135 + }, + { + "auxiliary_loss_clip": 0.01143924, + "auxiliary_loss_mlp": 0.01148814, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00109029, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.429977333657625, + "language_loss": 0.76548159, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.78840899, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 3.924271821975708 + }, + { + "auxiliary_loss_clip": 0.01110541, + "auxiliary_loss_mlp": 0.01149617, + "balance_loss_clip": 1.00171232, + "balance_loss_mlp": 1.00103462, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.762038673878529, + "language_loss": 0.78634435, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80894589, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 4.193561553955078 + }, + { + "auxiliary_loss_clip": 0.01128202, + "auxiliary_loss_mlp": 0.01138264, + "balance_loss_clip": 1.00244522, + "balance_loss_mlp": 1.0001719, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9421152578510406, + "language_loss": 0.60087359, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62353826, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.134497880935669 + }, + { + "auxiliary_loss_clip": 0.01125866, + "auxiliary_loss_mlp": 0.01149422, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00074434, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.1902141822154317, + "language_loss": 0.77045655, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79320943, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.5902798175811768 + }, + { + "auxiliary_loss_clip": 0.0114505, + "auxiliary_loss_mlp": 0.01149523, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.00065422, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.089956807255724, + "language_loss": 0.79045916, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.81340486, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.561591863632202 + }, + { + "auxiliary_loss_clip": 0.01160282, + "auxiliary_loss_mlp": 0.01150522, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.0011766, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.895745071287261, + "language_loss": 0.91240442, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93551242, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.4943490028381348 + }, + { + "auxiliary_loss_clip": 0.0116113, + "auxiliary_loss_mlp": 0.01148928, + "balance_loss_clip": 1.00212908, + "balance_loss_mlp": 1.00101256, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.702937497355896, + "language_loss": 0.7521441, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77524471, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.579075336456299 + }, + { + "auxiliary_loss_clip": 0.01159785, + "auxiliary_loss_mlp": 0.01149226, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00092983, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 2.108776931869859, + "language_loss": 0.8986088, + "learning_rate": 3.608735651752494e-06, + "loss": 0.92169893, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.500044345855713 + }, + { + "auxiliary_loss_clip": 0.01143381, + "auxiliary_loss_mlp": 0.01149084, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00088346, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.5438842644947812, + "language_loss": 0.74862587, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.77155054, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 2.6663150787353516 + }, + { + "auxiliary_loss_clip": 0.01159297, + "auxiliary_loss_mlp": 0.0114947, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00088763, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.5463246291617343, + "language_loss": 0.71735215, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.74043989, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.541383981704712 + }, + { + "auxiliary_loss_clip": 0.01159768, + "auxiliary_loss_mlp": 0.01149434, + "balance_loss_clip": 1.00224662, + "balance_loss_mlp": 1.00132859, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.6371802540529408, + "language_loss": 0.78448546, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80757749, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.573366165161133 + }, + { + "auxiliary_loss_clip": 0.01145698, + "auxiliary_loss_mlp": 0.01149299, + "balance_loss_clip": 1.00217009, + "balance_loss_mlp": 1.00090706, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.944151868818907, + "language_loss": 0.68678522, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70973521, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.6195778846740723 + }, + { + "auxiliary_loss_clip": 0.0117651, + "auxiliary_loss_mlp": 0.01149706, + "balance_loss_clip": 1.00222671, + "balance_loss_mlp": 1.00112379, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6956713223592488, + "language_loss": 0.80477488, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82803702, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.5167500972747803 + }, + { + "auxiliary_loss_clip": 0.01131705, + "auxiliary_loss_mlp": 0.01148888, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.00106883, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 2.6961338087330504, + "language_loss": 0.78894091, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81174684, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.722712516784668 + }, + { + "auxiliary_loss_clip": 0.01110983, + "auxiliary_loss_mlp": 0.01137396, + "balance_loss_clip": 1.00254261, + "balance_loss_mlp": 1.00006711, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6538392152625665, + "language_loss": 0.5432505, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56573427, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.3057093620300293 + }, + { + "auxiliary_loss_clip": 0.01132217, + "auxiliary_loss_mlp": 0.01148847, + "balance_loss_clip": 1.00225413, + "balance_loss_mlp": 1.00083733, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 2.3439770073713033, + "language_loss": 0.70801651, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.73082709, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.621736526489258 + }, + { + "auxiliary_loss_clip": 0.0114285, + "auxiliary_loss_mlp": 0.01149059, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00104904, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.0060068810957574, + "language_loss": 0.74544179, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76836097, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.529134750366211 + }, + { + "auxiliary_loss_clip": 0.01176255, + "auxiliary_loss_mlp": 0.01148855, + "balance_loss_clip": 1.00208664, + "balance_loss_mlp": 1.00103545, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 1.8540958309693196, + "language_loss": 0.82501829, + "learning_rate": 3.606418687985928e-06, + "loss": 0.84826934, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.472369432449341 + }, + { + "auxiliary_loss_clip": 0.01143951, + "auxiliary_loss_mlp": 0.01149276, + "balance_loss_clip": 1.00200546, + "balance_loss_mlp": 1.00107539, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 1.7247499048068027, + "language_loss": 0.82686031, + "learning_rate": 3.606186656428641e-06, + "loss": 0.8497926, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.5721027851104736 + }, + { + "auxiliary_loss_clip": 0.01143613, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_clip": 1.00229907, + "balance_loss_mlp": 1.00092041, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 4.696374009809638, + "language_loss": 0.72868979, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.75162572, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.5762205123901367 + }, + { + "auxiliary_loss_clip": 0.01129187, + "auxiliary_loss_mlp": 0.0114934, + "balance_loss_clip": 1.00213587, + "balance_loss_mlp": 1.00075746, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.138818156927645, + "language_loss": 0.64634603, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66913128, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.628297805786133 + }, + { + "auxiliary_loss_clip": 0.01161177, + "auxiliary_loss_mlp": 0.01149129, + "balance_loss_clip": 1.00220287, + "balance_loss_mlp": 1.00092769, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.7364950173082856, + "language_loss": 0.70712733, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.73023039, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.5146780014038086 + }, + { + "auxiliary_loss_clip": 0.01159809, + "auxiliary_loss_mlp": 0.01149443, + "balance_loss_clip": 1.00215507, + "balance_loss_mlp": 1.00095534, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.7584083232722718, + "language_loss": 0.89529228, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91838479, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.534846305847168 + }, + { + "auxiliary_loss_clip": 0.01176492, + "auxiliary_loss_mlp": 0.01149424, + "balance_loss_clip": 1.00215244, + "balance_loss_mlp": 1.00093734, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.037363157834909, + "language_loss": 0.7424897, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76574886, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.4477009773254395 + }, + { + "auxiliary_loss_clip": 0.01144216, + "auxiliary_loss_mlp": 0.01148642, + "balance_loss_clip": 1.00195777, + "balance_loss_mlp": 1.00091779, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.4655685563089829, + "language_loss": 0.82822967, + "learning_rate": 3.604793188351095e-06, + "loss": 0.85115826, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.592524290084839 + }, + { + "auxiliary_loss_clip": 0.01143203, + "auxiliary_loss_mlp": 0.0114954, + "balance_loss_clip": 1.00200522, + "balance_loss_mlp": 1.00086176, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 1.7571909831862595, + "language_loss": 0.7597608, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.7826882, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.6036932468414307 + }, + { + "auxiliary_loss_clip": 0.01176251, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00078559, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6071069368349884, + "language_loss": 0.70724392, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73049629, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.518430471420288 + }, + { + "auxiliary_loss_clip": 0.0112686, + "auxiliary_loss_mlp": 0.01137393, + "balance_loss_clip": 1.00255203, + "balance_loss_mlp": 1.0000639, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.814233657953395, + "language_loss": 0.61910224, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.64174485, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.1735715866088867 + }, + { + "auxiliary_loss_clip": 0.01144332, + "auxiliary_loss_mlp": 0.01149572, + "balance_loss_clip": 1.00223136, + "balance_loss_mlp": 1.00089371, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 3.7475702769872816, + "language_loss": 0.86306381, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88600278, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.5448944568634033 + }, + { + "auxiliary_loss_clip": 0.01144477, + "auxiliary_loss_mlp": 0.01148878, + "balance_loss_clip": 1.0021348, + "balance_loss_mlp": 1.00086784, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.3026426665611577, + "language_loss": 0.72593087, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74886441, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 4.125001668930054 + }, + { + "auxiliary_loss_clip": 0.0114417, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_clip": 1.0021174, + "balance_loss_mlp": 1.00063014, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.3522884653779714, + "language_loss": 0.67110574, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69403481, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.5141944885253906 + }, + { + "auxiliary_loss_clip": 0.01145819, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.00215209, + "balance_loss_mlp": 1.00076294, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 1.894306710064158, + "language_loss": 0.75870919, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78166175, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.60048246383667 + }, + { + "auxiliary_loss_clip": 0.01110576, + "auxiliary_loss_mlp": 0.01148477, + "balance_loss_clip": 1.00165355, + "balance_loss_mlp": 1.00103927, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.0934496842637484, + "language_loss": 0.91217619, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93476677, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.641700506210327 + }, + { + "auxiliary_loss_clip": 0.01160823, + "auxiliary_loss_mlp": 0.01148585, + "balance_loss_clip": 1.00200522, + "balance_loss_mlp": 1.00057459, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.6342458616714757, + "language_loss": 0.82532799, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84842205, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.6123344898223877 + }, + { + "auxiliary_loss_clip": 0.0117493, + "auxiliary_loss_mlp": 0.01137475, + "balance_loss_clip": 1.00258303, + "balance_loss_mlp": 1.00014544, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1374509672737805, + "language_loss": 0.65627491, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67939895, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 2.838312864303589 + }, + { + "auxiliary_loss_clip": 0.01176494, + "auxiliary_loss_mlp": 0.01149571, + "balance_loss_clip": 1.00219822, + "balance_loss_mlp": 1.00098872, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 1.946130837764317, + "language_loss": 0.77058208, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79384273, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.5518746376037598 + }, + { + "auxiliary_loss_clip": 0.01128641, + "auxiliary_loss_mlp": 0.01149222, + "balance_loss_clip": 1.00206494, + "balance_loss_mlp": 1.00082982, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 5.831997499271413, + "language_loss": 0.81070828, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.83348691, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.6387383937835693 + }, + { + "auxiliary_loss_clip": 0.01159557, + "auxiliary_loss_mlp": 0.0114856, + "balance_loss_clip": 1.00199974, + "balance_loss_mlp": 1.0008359, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.6248388814392105, + "language_loss": 0.76828414, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79136527, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 4.004822015762329 + }, + { + "auxiliary_loss_clip": 0.01127441, + "auxiliary_loss_mlp": 0.00748534, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.00084782, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.2320083621124596, + "language_loss": 0.96015453, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.97891426, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 3.9738569259643555 + }, + { + "auxiliary_loss_clip": 0.01160529, + "auxiliary_loss_mlp": 0.00748411, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.00088573, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.8408060067260783, + "language_loss": 0.81162709, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83071649, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 4.0555291175842285 + }, + { + "auxiliary_loss_clip": 0.01128735, + "auxiliary_loss_mlp": 0.0114912, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00072837, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.3574757072168007, + "language_loss": 0.79251975, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81529832, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.6441938877105713 + }, + { + "auxiliary_loss_clip": 0.01144026, + "auxiliary_loss_mlp": 0.01148983, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00097299, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.520043898574153, + "language_loss": 0.75288928, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.7758193, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.625885486602783 + }, + { + "auxiliary_loss_clip": 0.01144347, + "auxiliary_loss_mlp": 0.01148555, + "balance_loss_clip": 1.00205934, + "balance_loss_mlp": 1.00092638, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.6474201777983286, + "language_loss": 0.63916528, + "learning_rate": 3.600599647297484e-06, + "loss": 0.6620943, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.6264467239379883 + }, + { + "auxiliary_loss_clip": 0.01144164, + "auxiliary_loss_mlp": 0.01148121, + "balance_loss_clip": 1.00217092, + "balance_loss_mlp": 1.00077832, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.6093284018654408, + "language_loss": 0.81878304, + "learning_rate": 3.60036609571682e-06, + "loss": 0.84170592, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.619565010070801 + }, + { + "auxiliary_loss_clip": 0.01145029, + "auxiliary_loss_mlp": 0.0114955, + "balance_loss_clip": 1.00215673, + "balance_loss_mlp": 1.00106335, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.687253621803218, + "language_loss": 0.78742909, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81037492, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.6299586296081543 + }, + { + "auxiliary_loss_clip": 0.01127581, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_clip": 1.00175524, + "balance_loss_mlp": 1.0009321, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.657533271726118, + "language_loss": 0.85242248, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87519056, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.623281955718994 + }, + { + "auxiliary_loss_clip": 0.01159691, + "auxiliary_loss_mlp": 0.01149355, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00086761, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.0069840124140756, + "language_loss": 0.7715295, + "learning_rate": 3.59966507689401e-06, + "loss": 0.79461998, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.5048840045928955 + }, + { + "auxiliary_loss_clip": 0.01143404, + "auxiliary_loss_mlp": 0.00748566, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.0009439, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 9.306298962392118, + "language_loss": 0.78730369, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.80622339, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.5707569122314453 + }, + { + "auxiliary_loss_clip": 0.0114428, + "auxiliary_loss_mlp": 0.01149734, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.00115168, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 1.8510026639780268, + "language_loss": 0.69489753, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71783769, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.7188825607299805 + }, + { + "auxiliary_loss_clip": 0.01159881, + "auxiliary_loss_mlp": 0.01149762, + "balance_loss_clip": 1.00222754, + "balance_loss_mlp": 1.00117981, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.0498155275038656, + "language_loss": 0.65351599, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67661244, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.5468544960021973 + }, + { + "auxiliary_loss_clip": 0.01110844, + "auxiliary_loss_mlp": 0.01149945, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00107658, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8928574351430703, + "language_loss": 0.74921274, + "learning_rate": 3.598729535939222e-06, + "loss": 0.77182066, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.6334028244018555 + }, + { + "auxiliary_loss_clip": 0.01143774, + "auxiliary_loss_mlp": 0.01148554, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00102091, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.540294354120187, + "language_loss": 0.81678981, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.8397131, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.586583137512207 + }, + { + "auxiliary_loss_clip": 0.01160019, + "auxiliary_loss_mlp": 0.01148982, + "balance_loss_clip": 1.00224495, + "balance_loss_mlp": 1.00087655, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 1.9281516184380707, + "language_loss": 0.7818439, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80493391, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 2.5301146507263184 + }, + { + "auxiliary_loss_clip": 0.0114866, + "auxiliary_loss_mlp": 0.00748407, + "balance_loss_clip": 1.00242627, + "balance_loss_mlp": 1.00079656, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.7397370084509367, + "language_loss": 0.82771349, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.8466841, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.571195125579834 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01150582, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.00133228, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 2.53304898973113, + "language_loss": 0.83427978, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.85691297, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.6445250511169434 + }, + { + "auxiliary_loss_clip": 0.01160859, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.00095439, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 2.0479723702821913, + "language_loss": 0.7018702, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72496843, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.664949417114258 + }, + { + "auxiliary_loss_clip": 0.01160912, + "auxiliary_loss_mlp": 0.01148563, + "balance_loss_clip": 1.00212514, + "balance_loss_mlp": 1.00093424, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.3382154302075575, + "language_loss": 0.66581917, + "learning_rate": 3.597324405965139e-06, + "loss": 0.68891394, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.637006998062134 + }, + { + "auxiliary_loss_clip": 0.01165383, + "auxiliary_loss_mlp": 0.01149363, + "balance_loss_clip": 1.00254893, + "balance_loss_mlp": 1.00097132, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.6929579362445522, + "language_loss": 0.83594906, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85909659, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.595231294631958 + }, + { + "auxiliary_loss_clip": 0.01159142, + "auxiliary_loss_mlp": 0.01148772, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.000857, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.339838912054637, + "language_loss": 0.87518835, + "learning_rate": 3.596855544646742e-06, + "loss": 0.89826751, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.5016398429870605 + }, + { + "auxiliary_loss_clip": 0.01144323, + "auxiliary_loss_mlp": 0.01149264, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00106263, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 1.7630786525427282, + "language_loss": 0.74887329, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.7718091, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 2.62642502784729 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01148908, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00089741, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.674861028754872, + "language_loss": 0.7471211, + "learning_rate": 3.596386441116659e-06, + "loss": 0.7702024, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.5383424758911133 + }, + { + "auxiliary_loss_clip": 0.01159514, + "auxiliary_loss_mlp": 0.01148868, + "balance_loss_clip": 1.00212455, + "balance_loss_mlp": 1.00095367, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.7784475364525834, + "language_loss": 0.80889034, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83197415, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.6084389686584473 + }, + { + "auxiliary_loss_clip": 0.01143187, + "auxiliary_loss_mlp": 0.01149649, + "balance_loss_clip": 1.00200975, + "balance_loss_mlp": 1.00106597, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.569229516143731, + "language_loss": 0.69311422, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71604258, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.5422511100769043 + }, + { + "auxiliary_loss_clip": 0.0110892, + "auxiliary_loss_mlp": 0.01148943, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00083721, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.500129089316491, + "language_loss": 0.82992715, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.8525058, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.702317237854004 + }, + { + "auxiliary_loss_clip": 0.01176389, + "auxiliary_loss_mlp": 0.01149186, + "balance_loss_clip": 1.00228631, + "balance_loss_mlp": 1.00079393, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.465028638734271, + "language_loss": 0.66475874, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68801445, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.5132546424865723 + }, + { + "auxiliary_loss_clip": 0.01158947, + "auxiliary_loss_mlp": 0.01136857, + "balance_loss_clip": 1.00289702, + "balance_loss_mlp": 1.00029087, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.7878986009328245, + "language_loss": 0.56790185, + "learning_rate": 3.595212623082357e-06, + "loss": 0.59085989, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.194256544113159 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01147461, + "balance_loss_clip": 1.00198913, + "balance_loss_mlp": 1.00088096, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 1.9009415112587198, + "language_loss": 0.73337752, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75627971, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.5249087810516357 + }, + { + "auxiliary_loss_clip": 0.01159789, + "auxiliary_loss_mlp": 0.01149053, + "balance_loss_clip": 1.00233722, + "balance_loss_mlp": 1.00104249, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8236308147988411, + "language_loss": 0.87677097, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89985931, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 3.9953649044036865 + }, + { + "auxiliary_loss_clip": 0.01143022, + "auxiliary_loss_mlp": 0.01149412, + "balance_loss_clip": 1.00211072, + "balance_loss_mlp": 1.00101995, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.2846870512192905, + "language_loss": 0.81310737, + "learning_rate": 3.594507606303083e-06, + "loss": 0.83603179, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.560490131378174 + }, + { + "auxiliary_loss_clip": 0.01094096, + "auxiliary_loss_mlp": 0.01148564, + "balance_loss_clip": 1.00159657, + "balance_loss_mlp": 1.00083971, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.9890536751920536, + "language_loss": 0.8665787, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88900524, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.645845413208008 + }, + { + "auxiliary_loss_clip": 0.01142793, + "auxiliary_loss_mlp": 0.01148374, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.0008409, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.384243549311521, + "language_loss": 0.70782858, + "learning_rate": 3.594037292782607e-06, + "loss": 0.73074019, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.5692615509033203 + }, + { + "auxiliary_loss_clip": 0.01093864, + "auxiliary_loss_mlp": 0.0114761, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00064874, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5200475535641553, + "language_loss": 0.84425962, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86667436, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.767443895339966 + }, + { + "auxiliary_loss_clip": 0.01159937, + "auxiliary_loss_mlp": 0.01148749, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.00092971, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.836173307025677, + "language_loss": 0.67270935, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69579619, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.740032434463501 + }, + { + "auxiliary_loss_clip": 0.01129073, + "auxiliary_loss_mlp": 0.01149224, + "balance_loss_clip": 1.00219953, + "balance_loss_mlp": 1.00102305, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.000655513042501, + "language_loss": 0.75445753, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77724051, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.674222230911255 + }, + { + "auxiliary_loss_clip": 0.01108991, + "auxiliary_loss_mlp": 0.01148356, + "balance_loss_clip": 1.0018096, + "balance_loss_mlp": 1.00101376, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.7177987800792693, + "language_loss": 0.87613565, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89870906, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01127535, + "auxiliary_loss_mlp": 0.01148641, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00082171, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.9775395746375406, + "language_loss": 0.75518215, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77794385, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 4.0333263874053955 + }, + { + "auxiliary_loss_clip": 0.0112946, + "auxiliary_loss_mlp": 0.01148911, + "balance_loss_clip": 1.00220323, + "balance_loss_mlp": 1.00118649, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.9860148077858655, + "language_loss": 0.86101878, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88380253, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 5.503667593002319 + }, + { + "auxiliary_loss_clip": 0.01129182, + "auxiliary_loss_mlp": 0.01149169, + "balance_loss_clip": 1.00223827, + "balance_loss_mlp": 1.00106359, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.371603642558404, + "language_loss": 0.82286799, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84565145, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.6320321559906006 + }, + { + "auxiliary_loss_clip": 0.01160118, + "auxiliary_loss_mlp": 0.01148792, + "balance_loss_clip": 1.00229919, + "balance_loss_mlp": 1.00097251, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.5577452136198995, + "language_loss": 0.79569405, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81878316, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 2.5355961322784424 + }, + { + "auxiliary_loss_clip": 0.01127252, + "auxiliary_loss_mlp": 0.01136687, + "balance_loss_clip": 1.00286889, + "balance_loss_mlp": 1.0001204, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9334162652735384, + "language_loss": 0.65429229, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67693168, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.1121649742126465 + }, + { + "auxiliary_loss_clip": 0.01159657, + "auxiliary_loss_mlp": 0.01148291, + "balance_loss_clip": 1.00231719, + "balance_loss_mlp": 1.00113916, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 1.8227145811159038, + "language_loss": 0.75882, + "learning_rate": 3.591682099845058e-06, + "loss": 0.78189939, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.5342493057250977 + }, + { + "auxiliary_loss_clip": 0.01144471, + "auxiliary_loss_mlp": 0.01148986, + "balance_loss_clip": 1.00228429, + "balance_loss_mlp": 1.00097537, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 2.113561545657212, + "language_loss": 0.68910515, + "learning_rate": 3.591446248441752e-06, + "loss": 0.71203971, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.5540568828582764 + }, + { + "auxiliary_loss_clip": 0.01176328, + "auxiliary_loss_mlp": 0.01148727, + "balance_loss_clip": 1.0023253, + "balance_loss_mlp": 1.00081253, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 1.794476341422591, + "language_loss": 0.79503107, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81828165, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.4661664962768555 + }, + { + "auxiliary_loss_clip": 0.01159599, + "auxiliary_loss_mlp": 0.01148276, + "balance_loss_clip": 1.0021348, + "balance_loss_mlp": 1.00093341, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.7914564139307394, + "language_loss": 0.82830638, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85138512, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.548348903656006 + }, + { + "auxiliary_loss_clip": 0.01160939, + "auxiliary_loss_mlp": 0.01148579, + "balance_loss_clip": 1.00229812, + "balance_loss_mlp": 1.0009501, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.4523447366860167, + "language_loss": 0.66717255, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.69026774, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.719547748565674 + }, + { + "auxiliary_loss_clip": 0.01160729, + "auxiliary_loss_mlp": 0.011484, + "balance_loss_clip": 1.00225675, + "balance_loss_mlp": 1.00124836, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.9764242918364632, + "language_loss": 0.77530813, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79839945, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.6341640949249268 + }, + { + "auxiliary_loss_clip": 0.01160761, + "auxiliary_loss_mlp": 0.01148723, + "balance_loss_clip": 1.00219154, + "balance_loss_mlp": 1.00099885, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.5771924884588315, + "language_loss": 0.78349054, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80658543, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.5350637435913086 + }, + { + "auxiliary_loss_clip": 0.0112798, + "auxiliary_loss_mlp": 0.0114765, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00097466, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.085815363249525, + "language_loss": 0.76564986, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78840613, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.6988470554351807 + }, + { + "auxiliary_loss_clip": 0.01143328, + "auxiliary_loss_mlp": 0.01148898, + "balance_loss_clip": 1.00211942, + "balance_loss_mlp": 1.00088763, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 9.61508586171949, + "language_loss": 0.69682205, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71974427, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.5466434955596924 + }, + { + "auxiliary_loss_clip": 0.0115787, + "auxiliary_loss_mlp": 0.01138546, + "balance_loss_clip": 1.00234115, + "balance_loss_mlp": 1.00121677, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7906613859615585, + "language_loss": 0.61064863, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63361281, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.053560972213745 + }, + { + "auxiliary_loss_clip": 0.01159052, + "auxiliary_loss_mlp": 0.01148511, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00107348, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.452459783424939, + "language_loss": 0.78014511, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80322075, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.51295804977417 + }, + { + "auxiliary_loss_clip": 0.01159552, + "auxiliary_loss_mlp": 0.0114903, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.00082898, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.8305694845690035, + "language_loss": 0.7154209, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73850667, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.686441421508789 + }, + { + "auxiliary_loss_clip": 0.01144141, + "auxiliary_loss_mlp": 0.00748374, + "balance_loss_clip": 1.00221086, + "balance_loss_mlp": 1.00074935, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 1.8103143241577497, + "language_loss": 0.76249081, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78141594, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.6386470794677734 + }, + { + "auxiliary_loss_clip": 0.01176318, + "auxiliary_loss_mlp": 0.0114836, + "balance_loss_clip": 1.0023576, + "balance_loss_mlp": 1.00082684, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.664308468398209, + "language_loss": 0.6984899, + "learning_rate": 3.588611327033723e-06, + "loss": 0.72173667, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.4944956302642822 + }, + { + "auxiliary_loss_clip": 0.01133518, + "auxiliary_loss_mlp": 0.0114902, + "balance_loss_clip": 1.00283992, + "balance_loss_mlp": 1.0010097, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 3.7156908309587267, + "language_loss": 0.67586368, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69868904, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.5789296627044678 + }, + { + "auxiliary_loss_clip": 0.01159748, + "auxiliary_loss_mlp": 0.01149054, + "balance_loss_clip": 1.00235307, + "balance_loss_mlp": 1.00085318, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.8971526717932297, + "language_loss": 0.79895544, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82204342, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.5988314151763916 + }, + { + "auxiliary_loss_clip": 0.01145522, + "auxiliary_loss_mlp": 0.01149705, + "balance_loss_clip": 1.0021776, + "balance_loss_mlp": 1.00102746, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 2.248200414658733, + "language_loss": 0.6542052, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67715752, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.604701280593872 + }, + { + "auxiliary_loss_clip": 0.01176291, + "auxiliary_loss_mlp": 0.01148808, + "balance_loss_clip": 1.00225091, + "balance_loss_mlp": 1.00117874, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 1.861101846441361, + "language_loss": 0.7168377, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.7400887, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.5396029949188232 + }, + { + "auxiliary_loss_clip": 0.01111131, + "auxiliary_loss_mlp": 0.01148522, + "balance_loss_clip": 1.0020467, + "balance_loss_mlp": 1.00089324, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 1.8272380631172287, + "language_loss": 0.7787385, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.8013351, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 2.763537645339966 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.00748483, + "balance_loss_clip": 1.00218606, + "balance_loss_mlp": 1.00081074, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.384736262267494, + "language_loss": 0.91182363, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93075591, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.560976505279541 + }, + { + "auxiliary_loss_clip": 0.01111164, + "auxiliary_loss_mlp": 0.01148754, + "balance_loss_clip": 1.00209224, + "balance_loss_mlp": 1.00093484, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.7725594122494976, + "language_loss": 0.76972961, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.79232877, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.6904115676879883 + }, + { + "auxiliary_loss_clip": 0.01159381, + "auxiliary_loss_mlp": 0.0114787, + "balance_loss_clip": 1.00200677, + "balance_loss_mlp": 1.00062263, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.7982636465517192, + "language_loss": 0.84069192, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86376446, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.541701078414917 + }, + { + "auxiliary_loss_clip": 0.01111252, + "auxiliary_loss_mlp": 0.01149164, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00086808, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.1691040330156537, + "language_loss": 0.83392465, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85652882, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.6607606410980225 + }, + { + "auxiliary_loss_clip": 0.01161053, + "auxiliary_loss_mlp": 0.00748425, + "balance_loss_clip": 1.00222421, + "balance_loss_mlp": 1.00073719, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.5406092715803046, + "language_loss": 0.86005795, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87915272, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 2.6046364307403564 + }, + { + "auxiliary_loss_clip": 0.01131652, + "auxiliary_loss_mlp": 0.01148247, + "balance_loss_clip": 1.00245237, + "balance_loss_mlp": 1.00090468, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.4536887471877136, + "language_loss": 0.75330496, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.77610397, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.6282925605773926 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.01148388, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00104523, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.6981790187837353, + "language_loss": 0.74896729, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77171314, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 3.9645538330078125 + }, + { + "auxiliary_loss_clip": 0.01176217, + "auxiliary_loss_mlp": 0.01148672, + "balance_loss_clip": 1.0022167, + "balance_loss_mlp": 1.00075746, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 1.5619262046332854, + "language_loss": 0.7020306, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72527951, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.600170135498047 + }, + { + "auxiliary_loss_clip": 0.01176457, + "auxiliary_loss_mlp": 0.011498, + "balance_loss_clip": 1.00231981, + "balance_loss_mlp": 1.00112224, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.8489926340964524, + "language_loss": 0.94743669, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97069925, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.5285966396331787 + }, + { + "auxiliary_loss_clip": 0.01160816, + "auxiliary_loss_mlp": 0.01148705, + "balance_loss_clip": 1.00233126, + "balance_loss_mlp": 1.00117159, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 3.1034271977214245, + "language_loss": 0.73051417, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.7536093, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.5358004570007324 + }, + { + "auxiliary_loss_clip": 0.01144715, + "auxiliary_loss_mlp": 0.01148395, + "balance_loss_clip": 1.00197923, + "balance_loss_mlp": 1.00086188, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.827635056019525, + "language_loss": 0.8199513, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84288239, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 2.578490734100342 + }, + { + "auxiliary_loss_clip": 0.01161032, + "auxiliary_loss_mlp": 0.01147614, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00084376, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.7265894605020378, + "language_loss": 0.73398697, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75707346, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.5129125118255615 + }, + { + "auxiliary_loss_clip": 0.01159525, + "auxiliary_loss_mlp": 0.01148334, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.00108719, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.8450745537779358, + "language_loss": 0.79770541, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.82078397, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.6007003784179688 + }, + { + "auxiliary_loss_clip": 0.01176281, + "auxiliary_loss_mlp": 0.01148902, + "balance_loss_clip": 1.00229013, + "balance_loss_mlp": 1.00070095, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 1.7529276441616306, + "language_loss": 0.70595002, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72920179, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.468492031097412 + }, + { + "auxiliary_loss_clip": 0.01159786, + "auxiliary_loss_mlp": 0.01149135, + "balance_loss_clip": 1.00232577, + "balance_loss_mlp": 1.00112426, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 1.8668191609722837, + "language_loss": 0.69180536, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71489453, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 5.445667028427124 + }, + { + "auxiliary_loss_clip": 0.01160976, + "auxiliary_loss_mlp": 0.01149409, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00092161, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.5689657273867865, + "language_loss": 0.77607238, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.79917622, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 4.17738938331604 + }, + { + "auxiliary_loss_clip": 0.011422, + "auxiliary_loss_mlp": 0.01136586, + "balance_loss_clip": 1.0022167, + "balance_loss_mlp": 1.00002027, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8492033001957227, + "language_loss": 0.60557467, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62836254, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.091002941131592 + }, + { + "auxiliary_loss_clip": 0.01144012, + "auxiliary_loss_mlp": 0.01148189, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00094128, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.722581097892403, + "language_loss": 0.81180406, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83472598, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.6770131587982178 + }, + { + "auxiliary_loss_clip": 0.01176403, + "auxiliary_loss_mlp": 0.00748225, + "balance_loss_clip": 1.0024029, + "balance_loss_mlp": 1.00066686, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.7796039709546596, + "language_loss": 0.61027563, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.62952197, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.584632396697998 + }, + { + "auxiliary_loss_clip": 0.01144425, + "auxiliary_loss_mlp": 0.0114841, + "balance_loss_clip": 1.00228095, + "balance_loss_mlp": 1.00087643, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.8368899194191077, + "language_loss": 0.70916879, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.73209709, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.5913503170013428 + }, + { + "auxiliary_loss_clip": 0.011598, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_clip": 1.00221372, + "balance_loss_mlp": 1.00114846, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.1851623493632277, + "language_loss": 0.80906135, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83214808, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.5200977325439453 + }, + { + "auxiliary_loss_clip": 0.01095488, + "auxiliary_loss_mlp": 0.01149338, + "balance_loss_clip": 1.0018568, + "balance_loss_mlp": 1.00094664, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.6538204740996045, + "language_loss": 0.7502563, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.7727046, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 2.8064091205596924 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01148293, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00095046, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 3.3752067676885793, + "language_loss": 0.90297556, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92558372, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.6234724521636963 + }, + { + "auxiliary_loss_clip": 0.01142917, + "auxiliary_loss_mlp": 0.01149262, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00106049, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.5813841879167374, + "language_loss": 0.71663332, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.739555, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.5774388313293457 + }, + { + "auxiliary_loss_clip": 0.01176339, + "auxiliary_loss_mlp": 0.01148976, + "balance_loss_clip": 1.0024035, + "balance_loss_mlp": 1.00096583, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 2.645350334525367, + "language_loss": 0.68480796, + "learning_rate": 3.581486106120537e-06, + "loss": 0.7080611, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.547454833984375 + }, + { + "auxiliary_loss_clip": 0.01127544, + "auxiliary_loss_mlp": 0.01149456, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00106382, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 1.9990348900065387, + "language_loss": 0.77439523, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79716522, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.6851792335510254 + }, + { + "auxiliary_loss_clip": 0.01142652, + "auxiliary_loss_mlp": 0.01136546, + "balance_loss_clip": 1.002689, + "balance_loss_mlp": 0.99998009, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7859484204215936, + "language_loss": 0.59146696, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61425894, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.31671142578125 + }, + { + "auxiliary_loss_clip": 0.01132718, + "auxiliary_loss_mlp": 0.01148899, + "balance_loss_clip": 1.00247133, + "balance_loss_mlp": 1.00069833, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 1.831443122391669, + "language_loss": 0.80425781, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82707393, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.679572105407715 + }, + { + "auxiliary_loss_clip": 0.01159784, + "auxiliary_loss_mlp": 0.01148815, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.00099516, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.3885222584235297, + "language_loss": 0.88339865, + "learning_rate": 3.580531993380261e-06, + "loss": 0.9064846, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.500685453414917 + }, + { + "auxiliary_loss_clip": 0.01176311, + "auxiliary_loss_mlp": 0.01148955, + "balance_loss_clip": 1.00239813, + "balance_loss_mlp": 1.00094521, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 1.8749590335243638, + "language_loss": 0.73519957, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75845218, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.582815647125244 + }, + { + "auxiliary_loss_clip": 0.01159569, + "auxiliary_loss_mlp": 0.01148739, + "balance_loss_clip": 1.00225699, + "balance_loss_mlp": 1.00072932, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 2.1221140976525823, + "language_loss": 0.84390736, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86699045, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.6145975589752197 + }, + { + "auxiliary_loss_clip": 0.01144803, + "auxiliary_loss_mlp": 0.01149568, + "balance_loss_clip": 1.00229108, + "balance_loss_mlp": 1.00117624, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 2.329424170805187, + "language_loss": 0.87399018, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89693391, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.662130355834961 + }, + { + "auxiliary_loss_clip": 0.01159503, + "auxiliary_loss_mlp": 0.01148699, + "balance_loss_clip": 1.00221992, + "balance_loss_mlp": 1.00087929, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 2.252159778617325, + "language_loss": 0.77137071, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79445279, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.5538392066955566 + }, + { + "auxiliary_loss_clip": 0.01116517, + "auxiliary_loss_mlp": 0.0074829, + "balance_loss_clip": 1.00268817, + "balance_loss_mlp": 1.00069404, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.7301191696677716, + "language_loss": 0.73251295, + "learning_rate": 3.579338004009412e-06, + "loss": 0.7511611, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 2.900667667388916 + }, + { + "auxiliary_loss_clip": 0.01176137, + "auxiliary_loss_mlp": 0.01148271, + "balance_loss_clip": 1.00238979, + "balance_loss_mlp": 1.00092888, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 2.0399319917709158, + "language_loss": 0.82482302, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84806716, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.5136194229125977 + }, + { + "auxiliary_loss_clip": 0.01134529, + "auxiliary_loss_mlp": 0.0114953, + "balance_loss_clip": 1.00258636, + "balance_loss_mlp": 1.00094759, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 1.5474179501765275, + "language_loss": 0.65058231, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67342293, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.7905936241149902 + }, + { + "auxiliary_loss_clip": 0.01126951, + "auxiliary_loss_mlp": 0.01149287, + "balance_loss_clip": 1.00234103, + "balance_loss_mlp": 1.00079942, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 1.7571414713430495, + "language_loss": 0.79242206, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81518447, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.614130735397339 + }, + { + "auxiliary_loss_clip": 0.01160698, + "auxiliary_loss_mlp": 0.01148829, + "balance_loss_clip": 1.00228631, + "balance_loss_mlp": 1.00100994, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4546395459644081, + "language_loss": 0.82035136, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84344673, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.597079038619995 + }, + { + "auxiliary_loss_clip": 0.0115961, + "auxiliary_loss_mlp": 0.01149334, + "balance_loss_clip": 1.00234687, + "balance_loss_mlp": 1.00122797, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.1364113982769175, + "language_loss": 0.80606949, + "learning_rate": 3.578142517422292e-06, + "loss": 0.8291589, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.5335466861724854 + }, + { + "auxiliary_loss_clip": 0.01145602, + "auxiliary_loss_mlp": 0.01149078, + "balance_loss_clip": 1.00227892, + "balance_loss_mlp": 1.00106812, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.532079430354244, + "language_loss": 0.83321774, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85616457, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 2.5938265323638916 + }, + { + "auxiliary_loss_clip": 0.01160038, + "auxiliary_loss_mlp": 0.01149879, + "balance_loss_clip": 1.00236559, + "balance_loss_mlp": 1.00120068, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.7815682700150375, + "language_loss": 0.7950092, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81810838, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.5334343910217285 + }, + { + "auxiliary_loss_clip": 0.01128495, + "auxiliary_loss_mlp": 0.01148175, + "balance_loss_clip": 1.00238359, + "balance_loss_mlp": 1.00102329, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 2.0129133432216237, + "language_loss": 0.73800659, + "learning_rate": 3.577424507277614e-06, + "loss": 0.7607733, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.6124837398529053 + }, + { + "auxiliary_loss_clip": 0.01128599, + "auxiliary_loss_mlp": 0.0114874, + "balance_loss_clip": 1.00199139, + "balance_loss_mlp": 1.00101554, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.601510166913757, + "language_loss": 0.75179577, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77456915, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.63069748878479 + }, + { + "auxiliary_loss_clip": 0.01115355, + "auxiliary_loss_mlp": 0.01148806, + "balance_loss_clip": 1.00267541, + "balance_loss_mlp": 1.00098634, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 2.0208978464337766, + "language_loss": 0.66700357, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.68964517, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.6121389865875244 + }, + { + "auxiliary_loss_clip": 0.01111517, + "auxiliary_loss_mlp": 0.01136633, + "balance_loss_clip": 1.00294173, + "balance_loss_mlp": 1.00006723, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.8191142952038044, + "language_loss": 0.58252227, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60500371, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 4.6217546463012695 + }, + { + "auxiliary_loss_clip": 0.01143574, + "auxiliary_loss_mlp": 0.01149151, + "balance_loss_clip": 1.0022161, + "balance_loss_mlp": 1.00094986, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.752192884361888, + "language_loss": 0.8019073, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82483459, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.6309237480163574 + }, + { + "auxiliary_loss_clip": 0.01112013, + "auxiliary_loss_mlp": 0.01148924, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.00091362, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 1.8191798971908917, + "language_loss": 0.82321632, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84582567, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.7096331119537354 + }, + { + "auxiliary_loss_clip": 0.01176371, + "auxiliary_loss_mlp": 0.01149354, + "balance_loss_clip": 1.00245023, + "balance_loss_mlp": 1.00143886, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 2.024217213607178, + "language_loss": 0.7172085, + "learning_rate": 3.57598687219895e-06, + "loss": 0.74046576, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.56512451171875 + }, + { + "auxiliary_loss_clip": 0.01176175, + "auxiliary_loss_mlp": 0.01148308, + "balance_loss_clip": 1.00245333, + "balance_loss_mlp": 1.00087023, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.744752548486399, + "language_loss": 0.71292669, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73617148, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.620863914489746 + }, + { + "auxiliary_loss_clip": 0.01159477, + "auxiliary_loss_mlp": 0.01148747, + "balance_loss_clip": 1.00219333, + "balance_loss_mlp": 1.00073695, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.2513050038858204, + "language_loss": 0.73235273, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75543499, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.6377031803131104 + }, + { + "auxiliary_loss_clip": 0.01159341, + "auxiliary_loss_mlp": 0.01148859, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00123072, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.6898323212280437, + "language_loss": 0.73630661, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75938863, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.529595136642456 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_clip": 1.00305986, + "balance_loss_mlp": 1.00024056, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.012555957587903, + "language_loss": 0.73380286, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75660861, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 4.2822585105896 + }, + { + "auxiliary_loss_clip": 0.01160305, + "auxiliary_loss_mlp": 0.01148649, + "balance_loss_clip": 1.00225902, + "balance_loss_mlp": 1.00111604, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5736143514959984, + "language_loss": 0.87949997, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9025895, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 4.049538850784302 + }, + { + "auxiliary_loss_clip": 0.01159752, + "auxiliary_loss_mlp": 0.01149078, + "balance_loss_clip": 1.0023973, + "balance_loss_mlp": 1.00097227, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.181534788043642, + "language_loss": 0.76078504, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78387332, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 3.9158852100372314 + }, + { + "auxiliary_loss_clip": 0.01159605, + "auxiliary_loss_mlp": 0.01148874, + "balance_loss_clip": 1.0023371, + "balance_loss_mlp": 1.00115001, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.5283964034818303, + "language_loss": 0.81558943, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83867425, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.5443718433380127 + }, + { + "auxiliary_loss_clip": 0.01143818, + "auxiliary_loss_mlp": 0.01148462, + "balance_loss_clip": 1.00234175, + "balance_loss_mlp": 1.00111938, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.9242363225332604, + "language_loss": 0.71576393, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73868668, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 2.608837127685547 + }, + { + "auxiliary_loss_clip": 0.01160523, + "auxiliary_loss_mlp": 0.00748567, + "balance_loss_clip": 1.0024035, + "balance_loss_mlp": 1.00075197, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.599536346341962, + "language_loss": 0.7616291, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78072006, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.588823080062866 + }, + { + "auxiliary_loss_clip": 0.01095206, + "auxiliary_loss_mlp": 0.0114843, + "balance_loss_clip": 1.00213683, + "balance_loss_mlp": 1.00089622, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.5098883425295004, + "language_loss": 0.89581466, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91825104, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.6719932556152344 + }, + { + "auxiliary_loss_clip": 0.01147386, + "auxiliary_loss_mlp": 0.01136604, + "balance_loss_clip": 1.00345433, + "balance_loss_mlp": 1.00003767, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8063695785933428, + "language_loss": 0.59411991, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61695975, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 3.122154712677002 + }, + { + "auxiliary_loss_clip": 0.01109027, + "auxiliary_loss_mlp": 0.01136659, + "balance_loss_clip": 1.00207281, + "balance_loss_mlp": 1.00009274, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7754596213548007, + "language_loss": 0.49447781, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51693469, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.316272020339966 + }, + { + "auxiliary_loss_clip": 0.01127154, + "auxiliary_loss_mlp": 0.01149509, + "balance_loss_clip": 1.0021981, + "balance_loss_mlp": 1.00140309, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.000832016931038, + "language_loss": 0.76590848, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78867507, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.642744779586792 + }, + { + "auxiliary_loss_clip": 0.01113115, + "auxiliary_loss_mlp": 0.01149293, + "balance_loss_clip": 1.0022552, + "balance_loss_mlp": 1.00118756, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 1.7045870572948434, + "language_loss": 0.69438654, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71701062, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.6669199466705322 + }, + { + "auxiliary_loss_clip": 0.01132692, + "auxiliary_loss_mlp": 0.01148377, + "balance_loss_clip": 1.00307441, + "balance_loss_mlp": 1.0011301, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.7135923061377316, + "language_loss": 0.70654774, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72935843, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.7595877647399902 + }, + { + "auxiliary_loss_clip": 0.01142958, + "auxiliary_loss_mlp": 0.01149028, + "balance_loss_clip": 1.00236201, + "balance_loss_mlp": 1.00130367, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.5862521268508316, + "language_loss": 0.7714566, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79437649, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.621462345123291 + }, + { + "auxiliary_loss_clip": 0.01126949, + "auxiliary_loss_mlp": 0.0114896, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.0009501, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 3.182746920781611, + "language_loss": 0.74608505, + "learning_rate": 3.571901895946612e-06, + "loss": 0.76884407, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.6773712635040283 + }, + { + "auxiliary_loss_clip": 0.01142843, + "auxiliary_loss_mlp": 0.01147968, + "balance_loss_clip": 1.00221062, + "balance_loss_mlp": 1.00081587, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.0246081779147365, + "language_loss": 0.80419827, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82710636, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.6090312004089355 + }, + { + "auxiliary_loss_clip": 0.01111764, + "auxiliary_loss_mlp": 0.01149433, + "balance_loss_clip": 1.00227869, + "balance_loss_mlp": 1.00123239, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.8198296920607808, + "language_loss": 0.75011182, + "learning_rate": 3.571420177111754e-06, + "loss": 0.77272379, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.6541738510131836 + }, + { + "auxiliary_loss_clip": 0.01176376, + "auxiliary_loss_mlp": 0.01148929, + "balance_loss_clip": 1.00255299, + "balance_loss_mlp": 1.00101399, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 3.7981808156658876, + "language_loss": 0.82519233, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84844542, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.4837679862976074 + }, + { + "auxiliary_loss_clip": 0.0114872, + "auxiliary_loss_mlp": 0.01149575, + "balance_loss_clip": 1.00287032, + "balance_loss_mlp": 1.00127852, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.9056747440729493, + "language_loss": 0.59425759, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61724055, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.594961404800415 + }, + { + "auxiliary_loss_clip": 0.01159771, + "auxiliary_loss_mlp": 0.01148069, + "balance_loss_clip": 1.00236225, + "balance_loss_mlp": 1.00101233, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.8404828019526998, + "language_loss": 0.71808541, + "learning_rate": 3.570697151969235e-06, + "loss": 0.74116385, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.5851078033447266 + }, + { + "auxiliary_loss_clip": 0.01144057, + "auxiliary_loss_mlp": 0.01148625, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00137794, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8275479265294892, + "language_loss": 0.75521123, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77813804, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 2.6170151233673096 + }, + { + "auxiliary_loss_clip": 0.01143089, + "auxiliary_loss_mlp": 0.01148975, + "balance_loss_clip": 1.00211167, + "balance_loss_mlp": 1.00106049, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.4447528536609164, + "language_loss": 0.81783855, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.84075916, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.5482099056243896 + }, + { + "auxiliary_loss_clip": 0.01176251, + "auxiliary_loss_mlp": 0.01149182, + "balance_loss_clip": 1.00234437, + "balance_loss_mlp": 1.00136304, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 1.7240814113791905, + "language_loss": 0.71930337, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74255776, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.535973310470581 + }, + { + "auxiliary_loss_clip": 0.01176121, + "auxiliary_loss_mlp": 0.01148596, + "balance_loss_clip": 1.00243664, + "balance_loss_mlp": 1.0009675, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 1.7853415116167457, + "language_loss": 0.74121356, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76446074, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.6618974208831787 + }, + { + "auxiliary_loss_clip": 0.01159536, + "auxiliary_loss_mlp": 0.01148625, + "balance_loss_clip": 1.00231528, + "balance_loss_mlp": 1.0009011, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 1.9701967579062587, + "language_loss": 0.80699944, + "learning_rate": 3.569490918967136e-06, + "loss": 0.83008105, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.596149444580078 + }, + { + "auxiliary_loss_clip": 0.01127143, + "auxiliary_loss_mlp": 0.01147966, + "balance_loss_clip": 1.0022577, + "balance_loss_mlp": 1.00090885, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 2.0711445258005123, + "language_loss": 0.85159445, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87434554, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.675157308578491 + }, + { + "auxiliary_loss_clip": 0.011175, + "auxiliary_loss_mlp": 0.01148355, + "balance_loss_clip": 1.00268865, + "balance_loss_mlp": 1.0009166, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 1.840688470240928, + "language_loss": 0.82436824, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84702682, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.718752384185791 + }, + { + "auxiliary_loss_clip": 0.01176211, + "auxiliary_loss_mlp": 0.01148466, + "balance_loss_clip": 1.00246334, + "balance_loss_mlp": 1.0010277, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 2.3211914188173735, + "language_loss": 0.7900629, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.81330967, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.01159406, + "auxiliary_loss_mlp": 0.01146735, + "balance_loss_clip": 1.00228119, + "balance_loss_mlp": 1.00110865, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 2.250288297410426, + "language_loss": 0.79679096, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81985235, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.723656415939331 + }, + { + "auxiliary_loss_clip": 0.01143595, + "auxiliary_loss_mlp": 0.01147297, + "balance_loss_clip": 1.0021944, + "balance_loss_mlp": 1.00100327, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.5885325996705844, + "language_loss": 0.79366541, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81657434, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.623481035232544 + }, + { + "auxiliary_loss_clip": 0.01159346, + "auxiliary_loss_mlp": 0.01147434, + "balance_loss_clip": 1.00246716, + "balance_loss_mlp": 1.00104499, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 1.9035438165283978, + "language_loss": 0.85431933, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87738717, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.5239453315734863 + }, + { + "auxiliary_loss_clip": 0.01176096, + "auxiliary_loss_mlp": 0.01148161, + "balance_loss_clip": 1.00249338, + "balance_loss_mlp": 1.00100899, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.1578391653511773, + "language_loss": 0.94591564, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96915817, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 3.93742299079895 + }, + { + "auxiliary_loss_clip": 0.01176087, + "auxiliary_loss_mlp": 0.01148112, + "balance_loss_clip": 1.00237775, + "balance_loss_mlp": 1.00124669, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.5958346376358774, + "language_loss": 0.82010877, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84335077, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.5745012760162354 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.00748453, + "balance_loss_clip": 1.00241458, + "balance_loss_mlp": 1.00060248, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.404300638566651, + "language_loss": 0.89107585, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91004443, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.555208683013916 + }, + { + "auxiliary_loss_clip": 0.01175938, + "auxiliary_loss_mlp": 0.01147681, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00100625, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.5367288664332226, + "language_loss": 0.84588343, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86911964, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.459853410720825 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01148543, + "balance_loss_clip": 1.0024035, + "balance_loss_mlp": 1.00091445, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.771553742316391, + "language_loss": 0.80956137, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83233333, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.636242151260376 + }, + { + "auxiliary_loss_clip": 0.01144579, + "auxiliary_loss_mlp": 0.01148586, + "balance_loss_clip": 1.00231504, + "balance_loss_mlp": 1.00095701, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.5258085627323568, + "language_loss": 0.67511451, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69804615, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.526797294616699 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01148263, + "balance_loss_clip": 1.00221598, + "balance_loss_mlp": 1.00082445, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 4.283379042398813, + "language_loss": 0.75465834, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77757967, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.556443452835083 + }, + { + "auxiliary_loss_clip": 0.01160816, + "auxiliary_loss_mlp": 0.01148055, + "balance_loss_clip": 1.00237441, + "balance_loss_mlp": 1.00080776, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.413111245999568, + "language_loss": 0.63583601, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.6589247, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.01160614, + "auxiliary_loss_mlp": 0.01148202, + "balance_loss_clip": 1.00240254, + "balance_loss_mlp": 1.00085938, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.0865339472426836, + "language_loss": 0.77310455, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79619265, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 3.936366081237793 + }, + { + "auxiliary_loss_clip": 0.01160237, + "auxiliary_loss_mlp": 0.01148273, + "balance_loss_clip": 1.00244093, + "balance_loss_mlp": 1.00093019, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.4575679322261905, + "language_loss": 0.80863822, + "learning_rate": 3.565620980442944e-06, + "loss": 0.83172333, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 4.0527424812316895 + }, + { + "auxiliary_loss_clip": 0.01143442, + "auxiliary_loss_mlp": 0.01147369, + "balance_loss_clip": 1.00216854, + "balance_loss_mlp": 1.00107503, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.1169684727610747, + "language_loss": 0.80227673, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82518482, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 4.10293173789978 + }, + { + "auxiliary_loss_clip": 0.01143207, + "auxiliary_loss_mlp": 0.01148159, + "balance_loss_clip": 1.00248432, + "balance_loss_mlp": 1.00091171, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.6771206568713897, + "language_loss": 0.72870612, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75161976, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 2.59077525138855 + }, + { + "auxiliary_loss_clip": 0.01175999, + "auxiliary_loss_mlp": 0.01147468, + "balance_loss_clip": 1.00231564, + "balance_loss_mlp": 1.00098348, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 1.7973678513560243, + "language_loss": 0.73399949, + "learning_rate": 3.564893673833495e-06, + "loss": 0.7572341, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.514601469039917 + }, + { + "auxiliary_loss_clip": 0.01143036, + "auxiliary_loss_mlp": 0.01148104, + "balance_loss_clip": 1.00237513, + "balance_loss_mlp": 1.00095224, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.8458057955954486, + "language_loss": 0.73554462, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75845599, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.5670299530029297 + }, + { + "auxiliary_loss_clip": 0.01128444, + "auxiliary_loss_mlp": 0.0114787, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00071764, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.7245115516007024, + "language_loss": 0.7088052, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73156834, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.6890194416046143 + }, + { + "auxiliary_loss_clip": 0.01176151, + "auxiliary_loss_mlp": 0.01147747, + "balance_loss_clip": 1.00245917, + "balance_loss_mlp": 1.00097644, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.973369980357429, + "language_loss": 0.81577718, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.8390162, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.6823654174804688 + }, + { + "auxiliary_loss_clip": 0.0114211, + "auxiliary_loss_mlp": 0.01148355, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00101185, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 3.332337107959696, + "language_loss": 0.66041899, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68332368, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.6301121711730957 + }, + { + "auxiliary_loss_clip": 0.0117602, + "auxiliary_loss_mlp": 0.01147758, + "balance_loss_clip": 1.00233448, + "balance_loss_mlp": 1.00117791, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.3511645356766975, + "language_loss": 0.83927321, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86251104, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 2.5361227989196777 + }, + { + "auxiliary_loss_clip": 0.01127192, + "auxiliary_loss_mlp": 0.01147216, + "balance_loss_clip": 1.00217092, + "balance_loss_mlp": 1.00101793, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 1.9031223613982795, + "language_loss": 0.85069394, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.87343812, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.6115856170654297 + }, + { + "auxiliary_loss_clip": 0.01093465, + "auxiliary_loss_mlp": 0.01146869, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00114751, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 1.9535517788523313, + "language_loss": 0.70147282, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72387612, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.7117440700531006 + }, + { + "auxiliary_loss_clip": 0.01128479, + "auxiliary_loss_mlp": 0.01148143, + "balance_loss_clip": 1.00221181, + "balance_loss_mlp": 1.00080061, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.441544968216536, + "language_loss": 0.65726143, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68002766, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.622295618057251 + }, + { + "auxiliary_loss_clip": 0.0111527, + "auxiliary_loss_mlp": 0.01147596, + "balance_loss_clip": 1.00236297, + "balance_loss_mlp": 1.0007298, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.636878202709, + "language_loss": 0.72075689, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74338555, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.6471478939056396 + }, + { + "auxiliary_loss_clip": 0.0106177, + "auxiliary_loss_mlp": 0.01148171, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.0009234, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.6227438375435426, + "language_loss": 0.74268132, + "learning_rate": 3.562465462704307e-06, + "loss": 0.76478076, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.792320966720581 + }, + { + "auxiliary_loss_clip": 0.01176007, + "auxiliary_loss_mlp": 0.01148176, + "balance_loss_clip": 1.00228906, + "balance_loss_mlp": 1.00092864, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 1.8330624825727064, + "language_loss": 0.65685099, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68009281, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 2.743152618408203 + }, + { + "auxiliary_loss_clip": 0.01143912, + "auxiliary_loss_mlp": 0.01148197, + "balance_loss_clip": 1.0022155, + "balance_loss_mlp": 1.00123549, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.7544910407288234, + "language_loss": 0.74478137, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76770246, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.603738307952881 + }, + { + "auxiliary_loss_clip": 0.01126274, + "auxiliary_loss_mlp": 0.0114901, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00109529, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.6935054096084947, + "language_loss": 0.77658379, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79933667, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.6300694942474365 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01147825, + "balance_loss_clip": 1.00232339, + "balance_loss_mlp": 1.00114942, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.0107958648195248, + "language_loss": 0.71293098, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73573768, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.5985774993896484 + }, + { + "auxiliary_loss_clip": 0.01145225, + "auxiliary_loss_mlp": 0.01147835, + "balance_loss_clip": 1.00231981, + "balance_loss_mlp": 1.00068295, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.9215764139838787, + "language_loss": 0.78010106, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80303174, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.5487568378448486 + }, + { + "auxiliary_loss_clip": 0.01143818, + "auxiliary_loss_mlp": 0.01147564, + "balance_loss_clip": 1.00220466, + "balance_loss_mlp": 1.00098419, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.7811753859481345, + "language_loss": 0.68606925, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70898306, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.5767223834991455 + }, + { + "auxiliary_loss_clip": 0.01127574, + "auxiliary_loss_mlp": 0.01148392, + "balance_loss_clip": 1.00223136, + "balance_loss_mlp": 1.00114465, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 1.9911555072027498, + "language_loss": 0.6866017, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70936143, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.606539249420166 + }, + { + "auxiliary_loss_clip": 0.01109771, + "auxiliary_loss_mlp": 0.01147434, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00104547, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.787726687226825, + "language_loss": 0.76307875, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78565079, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.717737913131714 + }, + { + "auxiliary_loss_clip": 0.01148062, + "auxiliary_loss_mlp": 0.01147567, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00098777, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 1.986601394854504, + "language_loss": 0.76915151, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.79210782, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.5609018802642822 + }, + { + "auxiliary_loss_clip": 0.0112779, + "auxiliary_loss_mlp": 0.01147662, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00098634, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 2.840657003911415, + "language_loss": 0.85042894, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87318343, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.6530802249908447 + }, + { + "auxiliary_loss_clip": 0.01158338, + "auxiliary_loss_mlp": 0.01135871, + "balance_loss_clip": 1.00262308, + "balance_loss_mlp": 1.00006735, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.9339505861853586, + "language_loss": 0.62808049, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.65102255, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.1939971446990967 + }, + { + "auxiliary_loss_clip": 0.01142453, + "auxiliary_loss_mlp": 0.01147915, + "balance_loss_clip": 1.00218868, + "balance_loss_mlp": 1.00085807, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.938404349227396, + "language_loss": 0.8179552, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.84085888, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 2.5874102115631104 + }, + { + "auxiliary_loss_clip": 0.0114392, + "auxiliary_loss_mlp": 0.01148022, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00106049, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.5851663481731533, + "language_loss": 0.79053688, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8134563, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.592607259750366 + }, + { + "auxiliary_loss_clip": 0.01160533, + "auxiliary_loss_mlp": 0.01147847, + "balance_loss_clip": 1.0023315, + "balance_loss_mlp": 1.00117242, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 2.3074860072074994, + "language_loss": 0.84371316, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86679697, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.5135066509246826 + }, + { + "auxiliary_loss_clip": 0.01160637, + "auxiliary_loss_mlp": 0.01147233, + "balance_loss_clip": 1.00220466, + "balance_loss_mlp": 1.00074863, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.24664976109956, + "language_loss": 0.83862007, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.86169881, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.540940046310425 + }, + { + "auxiliary_loss_clip": 0.0109367, + "auxiliary_loss_mlp": 0.01147216, + "balance_loss_clip": 1.00179875, + "balance_loss_mlp": 1.00082707, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.6724974620667044, + "language_loss": 0.7450335, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76744229, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 4.17548394203186 + }, + { + "auxiliary_loss_clip": 0.011758, + "auxiliary_loss_mlp": 0.01147559, + "balance_loss_clip": 1.00227571, + "balance_loss_mlp": 1.00097919, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.8791096249311554, + "language_loss": 0.72156203, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74479556, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.517643928527832 + }, + { + "auxiliary_loss_clip": 0.01145217, + "auxiliary_loss_mlp": 0.01148356, + "balance_loss_clip": 1.00241077, + "balance_loss_mlp": 1.00101352, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 1.9591619079195939, + "language_loss": 0.78643155, + "learning_rate": 3.558079758168997e-06, + "loss": 0.8093673, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.5906124114990234 + }, + { + "auxiliary_loss_clip": 0.01143586, + "auxiliary_loss_mlp": 0.01147227, + "balance_loss_clip": 1.00213945, + "balance_loss_mlp": 1.00112462, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.661717511303738, + "language_loss": 0.8198874, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84279555, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.628891706466675 + }, + { + "auxiliary_loss_clip": 0.01127387, + "auxiliary_loss_mlp": 0.01147014, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00100684, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 2.0304404579870403, + "language_loss": 0.84170383, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86444783, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.6230597496032715 + }, + { + "auxiliary_loss_clip": 0.01143923, + "auxiliary_loss_mlp": 0.01148286, + "balance_loss_clip": 1.0021801, + "balance_loss_mlp": 1.00122976, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 6.591022741659904, + "language_loss": 0.77012908, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79305112, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.6319453716278076 + }, + { + "auxiliary_loss_clip": 0.01127025, + "auxiliary_loss_mlp": 0.0114703, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00092673, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 1.8241070507380215, + "language_loss": 0.78240097, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80514151, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 2.598627805709839 + }, + { + "auxiliary_loss_clip": 0.01158999, + "auxiliary_loss_mlp": 0.00748473, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.0008347, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.6500975078783089, + "language_loss": 0.73317492, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75224972, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.591538190841675 + }, + { + "auxiliary_loss_clip": 0.01127341, + "auxiliary_loss_mlp": 0.01147968, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00091171, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.0134871971697477, + "language_loss": 0.79109204, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.8138451, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 4.12801194190979 + }, + { + "auxiliary_loss_clip": 0.01112816, + "auxiliary_loss_mlp": 0.01147795, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00150144, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.1469224674198304, + "language_loss": 0.73293149, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75553763, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.7030813694000244 + }, + { + "auxiliary_loss_clip": 0.01159327, + "auxiliary_loss_mlp": 0.01147988, + "balance_loss_clip": 1.00220156, + "balance_loss_mlp": 1.00121748, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 1.7340505868207285, + "language_loss": 0.87678128, + "learning_rate": 3.556124408363871e-06, + "loss": 0.89985442, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 4.005195140838623 + }, + { + "auxiliary_loss_clip": 0.01159051, + "auxiliary_loss_mlp": 0.01147162, + "balance_loss_clip": 1.00224876, + "balance_loss_mlp": 1.00096369, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.1392166409964117, + "language_loss": 0.83314067, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85620284, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.485520124435425 + }, + { + "auxiliary_loss_clip": 0.01159105, + "auxiliary_loss_mlp": 0.01146906, + "balance_loss_clip": 1.00205421, + "balance_loss_mlp": 1.00089812, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.5776938414398953, + "language_loss": 0.85351372, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.8765738, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.502767324447632 + }, + { + "auxiliary_loss_clip": 0.01175776, + "auxiliary_loss_mlp": 0.01146716, + "balance_loss_clip": 1.00226259, + "balance_loss_mlp": 1.00089896, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.25910074280374, + "language_loss": 0.84536302, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86858797, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.448154926300049 + }, + { + "auxiliary_loss_clip": 0.01160496, + "auxiliary_loss_mlp": 0.01147139, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00103557, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.7336084541126027, + "language_loss": 0.75986332, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.78293967, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.6206214427948 + }, + { + "auxiliary_loss_clip": 0.01141752, + "auxiliary_loss_mlp": 0.01136008, + "balance_loss_clip": 1.00238192, + "balance_loss_mlp": 1.00020468, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8887290735745587, + "language_loss": 0.63707972, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65985727, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 3.0857155323028564 + }, + { + "auxiliary_loss_clip": 0.01157288, + "auxiliary_loss_mlp": 0.01135857, + "balance_loss_clip": 1.00252056, + "balance_loss_mlp": 1.00005376, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7538529441096652, + "language_loss": 0.62957048, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65250194, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.2504477500915527 + }, + { + "auxiliary_loss_clip": 0.01127617, + "auxiliary_loss_mlp": 0.01147964, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00100243, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.7095032436074589, + "language_loss": 0.77041876, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79317462, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.7497997283935547 + }, + { + "auxiliary_loss_clip": 0.01143962, + "auxiliary_loss_mlp": 0.01147809, + "balance_loss_clip": 1.00217259, + "balance_loss_mlp": 1.00113428, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.717361855493885, + "language_loss": 0.78378654, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80670428, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.6307666301727295 + }, + { + "auxiliary_loss_clip": 0.01124247, + "auxiliary_loss_mlp": 0.01136094, + "balance_loss_clip": 1.00218117, + "balance_loss_mlp": 1.00029111, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.9029193994152864, + "language_loss": 0.63535315, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65795654, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.231133222579956 + }, + { + "auxiliary_loss_clip": 0.01142115, + "auxiliary_loss_mlp": 0.01147724, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00104856, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.3694525956725068, + "language_loss": 0.70225811, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72515655, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.579646348953247 + }, + { + "auxiliary_loss_clip": 0.0116037, + "auxiliary_loss_mlp": 0.01146561, + "balance_loss_clip": 1.00224686, + "balance_loss_mlp": 1.00093484, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.8260045604150519, + "language_loss": 0.86754298, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.8906123, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.577862024307251 + }, + { + "auxiliary_loss_clip": 0.01160559, + "auxiliary_loss_mlp": 0.01148081, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.00073791, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.5792028840827086, + "language_loss": 0.76021963, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.783306, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.5563361644744873 + }, + { + "auxiliary_loss_clip": 0.01149318, + "auxiliary_loss_mlp": 0.01147248, + "balance_loss_clip": 1.00261116, + "balance_loss_mlp": 1.00095475, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.6463074027730014, + "language_loss": 0.72462082, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74758649, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 2.6317691802978516 + }, + { + "auxiliary_loss_clip": 0.01159251, + "auxiliary_loss_mlp": 0.01146968, + "balance_loss_clip": 1.00224686, + "balance_loss_mlp": 1.0008651, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 1.6204983229763645, + "language_loss": 0.66647333, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.6895355, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.5917439460754395 + }, + { + "auxiliary_loss_clip": 0.0117586, + "auxiliary_loss_mlp": 0.01147688, + "balance_loss_clip": 1.00233757, + "balance_loss_mlp": 1.0009172, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 1.8296757662025749, + "language_loss": 0.83000886, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85324436, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.5429606437683105 + }, + { + "auxiliary_loss_clip": 0.01127407, + "auxiliary_loss_mlp": 0.01147068, + "balance_loss_clip": 1.00201869, + "balance_loss_mlp": 1.00096488, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.8831092037275536, + "language_loss": 0.83479959, + "learning_rate": 3.552202383898897e-06, + "loss": 0.8575443, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.736567974090576 + }, + { + "auxiliary_loss_clip": 0.01144189, + "auxiliary_loss_mlp": 0.01148693, + "balance_loss_clip": 1.00227547, + "balance_loss_mlp": 1.00096917, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0062097303305144, + "language_loss": 0.87496436, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89789313, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.561525344848633 + }, + { + "auxiliary_loss_clip": 0.01143861, + "auxiliary_loss_mlp": 0.01147964, + "balance_loss_clip": 1.00203502, + "balance_loss_mlp": 1.00109792, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.8792647511532068, + "language_loss": 0.77644038, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79935873, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.625420570373535 + }, + { + "auxiliary_loss_clip": 0.01111289, + "auxiliary_loss_mlp": 0.01146797, + "balance_loss_clip": 1.00203979, + "balance_loss_mlp": 1.00088441, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.503688592482291, + "language_loss": 0.79011345, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81269425, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.724515914916992 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.00748528, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00074899, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 5.180331171648351, + "language_loss": 0.71109205, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73002422, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.69516658782959 + }, + { + "auxiliary_loss_clip": 0.01128139, + "auxiliary_loss_mlp": 0.01146942, + "balance_loss_clip": 1.00221896, + "balance_loss_mlp": 1.00112486, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.625565891445095, + "language_loss": 0.76189697, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78464776, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.6470119953155518 + }, + { + "auxiliary_loss_clip": 0.01159134, + "auxiliary_loss_mlp": 0.01147153, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00076365, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.3538104428442423, + "language_loss": 0.75103372, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.77409661, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.5279903411865234 + }, + { + "auxiliary_loss_clip": 0.0115905, + "auxiliary_loss_mlp": 0.01146621, + "balance_loss_clip": 1.00226545, + "balance_loss_mlp": 1.00089908, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 2.3433809056726393, + "language_loss": 0.80067909, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82373583, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 2.534742593765259 + }, + { + "auxiliary_loss_clip": 0.01142646, + "auxiliary_loss_mlp": 0.01148329, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.00108218, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.1470519629865827, + "language_loss": 0.70784223, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.73075199, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 2.621607780456543 + }, + { + "auxiliary_loss_clip": 0.0107852, + "auxiliary_loss_mlp": 0.0114701, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00081193, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.5653012743513832, + "language_loss": 0.6944775, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71673274, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.722144842147827 + }, + { + "auxiliary_loss_clip": 0.01159566, + "auxiliary_loss_mlp": 0.01148124, + "balance_loss_clip": 1.00237036, + "balance_loss_mlp": 1.00106716, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 1.5538406185318914, + "language_loss": 0.73735034, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.76042724, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.787895679473877 + }, + { + "auxiliary_loss_clip": 0.01176011, + "auxiliary_loss_mlp": 0.01147458, + "balance_loss_clip": 1.00248003, + "balance_loss_mlp": 1.00078321, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 2.0240221119417705, + "language_loss": 0.88218343, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9054181, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.5207149982452393 + }, + { + "auxiliary_loss_clip": 0.01145441, + "auxiliary_loss_mlp": 0.01147609, + "balance_loss_clip": 1.00226247, + "balance_loss_mlp": 1.00102949, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 1.8453126913222966, + "language_loss": 0.94659758, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96952808, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 4.092072010040283 + }, + { + "auxiliary_loss_clip": 0.01143743, + "auxiliary_loss_mlp": 0.011476, + "balance_loss_clip": 1.00217664, + "balance_loss_mlp": 1.00092542, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 2.479934035883829, + "language_loss": 0.83147466, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.85438812, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.5971245765686035 + }, + { + "auxiliary_loss_clip": 0.01127037, + "auxiliary_loss_mlp": 0.01147004, + "balance_loss_clip": 1.00206578, + "balance_loss_mlp": 1.00109208, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.6558492925711064, + "language_loss": 0.68715978, + "learning_rate": 3.54875825066639e-06, + "loss": 0.70990014, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.773898124694824 + }, + { + "auxiliary_loss_clip": 0.01160476, + "auxiliary_loss_mlp": 0.01148596, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.00115848, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.5886550833005066, + "language_loss": 0.85053718, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.8736279, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.5837595462799072 + }, + { + "auxiliary_loss_clip": 0.01158116, + "auxiliary_loss_mlp": 0.01135121, + "balance_loss_clip": 1.00333583, + "balance_loss_mlp": 1.00008035, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8163600103583257, + "language_loss": 0.60700482, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62993723, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.178473472595215 + }, + { + "auxiliary_loss_clip": 0.011435, + "auxiliary_loss_mlp": 0.01146842, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00112033, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.8549234121295948, + "language_loss": 0.73406541, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.7569688, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.6224875450134277 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01147296, + "balance_loss_clip": 1.00221682, + "balance_loss_mlp": 1.0008111, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.7364621322657738, + "language_loss": 0.81625384, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83898115, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 2.5986502170562744 + }, + { + "auxiliary_loss_clip": 0.01175906, + "auxiliary_loss_mlp": 0.01147909, + "balance_loss_clip": 1.00238442, + "balance_loss_mlp": 1.00094748, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 2.099086476169054, + "language_loss": 0.76540935, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78864747, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 4.085314512252808 + }, + { + "auxiliary_loss_clip": 0.01128273, + "auxiliary_loss_mlp": 0.01148009, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00085711, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.7009328265030907, + "language_loss": 0.75465143, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77741432, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.660294532775879 + }, + { + "auxiliary_loss_clip": 0.01143444, + "auxiliary_loss_mlp": 0.01147748, + "balance_loss_clip": 1.00217438, + "balance_loss_mlp": 1.00107312, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.945263290358201, + "language_loss": 0.82776862, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.85068059, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 4.047248840332031 + }, + { + "auxiliary_loss_clip": 0.01159238, + "auxiliary_loss_mlp": 0.01147036, + "balance_loss_clip": 1.00237679, + "balance_loss_mlp": 1.00093341, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.7512712090475402, + "language_loss": 0.85617006, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.87923276, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.504277467727661 + }, + { + "auxiliary_loss_clip": 0.01128783, + "auxiliary_loss_mlp": 0.01148375, + "balance_loss_clip": 1.00229859, + "balance_loss_mlp": 1.00093699, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.8320791838803079, + "language_loss": 0.7185939, + "learning_rate": 3.546538084949365e-06, + "loss": 0.74136549, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.5872747898101807 + }, + { + "auxiliary_loss_clip": 0.01160376, + "auxiliary_loss_mlp": 0.01146626, + "balance_loss_clip": 1.00233436, + "balance_loss_mlp": 1.0009048, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.9973099782142492, + "language_loss": 0.64598441, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66905445, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.5105960369110107 + }, + { + "auxiliary_loss_clip": 0.01159113, + "auxiliary_loss_mlp": 0.00748393, + "balance_loss_clip": 1.00227809, + "balance_loss_mlp": 1.0006063, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.40263828683696, + "language_loss": 0.70539707, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72447211, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.5488598346710205 + }, + { + "auxiliary_loss_clip": 0.01159076, + "auxiliary_loss_mlp": 0.01135085, + "balance_loss_clip": 1.00347495, + "balance_loss_mlp": 1.00004518, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8548633246667886, + "language_loss": 0.55316907, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57611072, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.120601177215576 + }, + { + "auxiliary_loss_clip": 0.01160546, + "auxiliary_loss_mlp": 0.01147105, + "balance_loss_clip": 1.00238585, + "balance_loss_mlp": 1.00090694, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 1.727031408027626, + "language_loss": 0.74688816, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76996458, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.5734071731567383 + }, + { + "auxiliary_loss_clip": 0.0117578, + "auxiliary_loss_mlp": 0.0114768, + "balance_loss_clip": 1.0023278, + "balance_loss_mlp": 1.00119615, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 1.7628808980722424, + "language_loss": 0.76693606, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.79017067, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.5158169269561768 + }, + { + "auxiliary_loss_clip": 0.01143969, + "auxiliary_loss_mlp": 0.00748534, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00080538, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.4447005228621213, + "language_loss": 0.65760851, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.67653358, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.596203565597534 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01147117, + "balance_loss_clip": 1.00214803, + "balance_loss_mlp": 1.0009191, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 1.8877120177007871, + "language_loss": 0.81322908, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.8362897, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.5112357139587402 + }, + { + "auxiliary_loss_clip": 0.01134084, + "auxiliary_loss_mlp": 0.01146572, + "balance_loss_clip": 1.00253451, + "balance_loss_mlp": 1.00075459, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 1.851393521928287, + "language_loss": 0.69097912, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71378571, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.684657335281372 + }, + { + "auxiliary_loss_clip": 0.01142937, + "auxiliary_loss_mlp": 0.01146943, + "balance_loss_clip": 1.00210392, + "balance_loss_mlp": 1.00064981, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.4451333159273747, + "language_loss": 0.96376348, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98666233, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.553905487060547 + }, + { + "auxiliary_loss_clip": 0.01142277, + "auxiliary_loss_mlp": 0.01145995, + "balance_loss_clip": 1.00220633, + "balance_loss_mlp": 1.00113225, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.6737701337369897, + "language_loss": 0.78217459, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80505729, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.62693190574646 + }, + { + "auxiliary_loss_clip": 0.01159322, + "auxiliary_loss_mlp": 0.01147159, + "balance_loss_clip": 1.00226378, + "balance_loss_mlp": 1.00096071, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.6334981044317713, + "language_loss": 0.74189627, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76496106, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.565324544906616 + }, + { + "auxiliary_loss_clip": 0.01134132, + "auxiliary_loss_mlp": 0.01146468, + "balance_loss_clip": 1.00224495, + "balance_loss_mlp": 1.00084233, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 1.8244134115380166, + "language_loss": 0.76877737, + "learning_rate": 3.543570475921171e-06, + "loss": 0.79158342, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.6754767894744873 + }, + { + "auxiliary_loss_clip": 0.01159476, + "auxiliary_loss_mlp": 0.01147114, + "balance_loss_clip": 1.00239062, + "balance_loss_mlp": 1.00101137, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.682363112469979, + "language_loss": 0.72171038, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74477637, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.5390632152557373 + }, + { + "auxiliary_loss_clip": 0.01160179, + "auxiliary_loss_mlp": 0.01146999, + "balance_loss_clip": 1.00224113, + "balance_loss_mlp": 1.00070548, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.822327890440421, + "language_loss": 0.78519052, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80826229, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 2.578542947769165 + }, + { + "auxiliary_loss_clip": 0.01108837, + "auxiliary_loss_mlp": 0.01146182, + "balance_loss_clip": 1.00196445, + "balance_loss_mlp": 1.00084221, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 2.725739899874767, + "language_loss": 0.80783379, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.83038396, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.687992572784424 + }, + { + "auxiliary_loss_clip": 0.01127941, + "auxiliary_loss_mlp": 0.01146176, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00093162, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 2.385827437018684, + "language_loss": 0.77052522, + "learning_rate": 3.542579399075957e-06, + "loss": 0.79326636, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.673569679260254 + }, + { + "auxiliary_loss_clip": 0.01067255, + "auxiliary_loss_mlp": 0.01146817, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.00071371, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.8688857772653436, + "language_loss": 0.81899625, + "learning_rate": 3.542331483604246e-06, + "loss": 0.84113705, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 2.812102794647217 + }, + { + "auxiliary_loss_clip": 0.01143525, + "auxiliary_loss_mlp": 0.01146941, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00074291, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.452025167687263, + "language_loss": 0.72715902, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75006378, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.5578348636627197 + }, + { + "auxiliary_loss_clip": 0.01158961, + "auxiliary_loss_mlp": 0.01146675, + "balance_loss_clip": 1.00232315, + "balance_loss_mlp": 1.00104952, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.9960436049684944, + "language_loss": 0.83837736, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.86143374, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.5625650882720947 + }, + { + "auxiliary_loss_clip": 0.01076357, + "auxiliary_loss_mlp": 0.01146706, + "balance_loss_clip": 1.0018028, + "balance_loss_mlp": 1.00098467, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.76312329448621, + "language_loss": 0.87030816, + "learning_rate": 3.541587386314541e-06, + "loss": 0.89253879, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.7288050651550293 + }, + { + "auxiliary_loss_clip": 0.01143041, + "auxiliary_loss_mlp": 0.01146371, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.00084031, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 2.036563892368149, + "language_loss": 0.72836226, + "learning_rate": 3.5413392369578e-06, + "loss": 0.75125635, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.586003303527832 + }, + { + "auxiliary_loss_clip": 0.01160477, + "auxiliary_loss_mlp": 0.01147056, + "balance_loss_clip": 1.00227284, + "balance_loss_mlp": 1.00085807, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 3.7618865547157725, + "language_loss": 0.73378295, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75685829, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 2.606313467025757 + }, + { + "auxiliary_loss_clip": 0.0112536, + "auxiliary_loss_mlp": 0.0114682, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.0010035, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 1.966804763558724, + "language_loss": 0.73558676, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75830859, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.5859804153442383 + }, + { + "auxiliary_loss_clip": 0.01126672, + "auxiliary_loss_mlp": 0.01146934, + "balance_loss_clip": 1.0020293, + "balance_loss_mlp": 1.0010221, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.5966999998517324, + "language_loss": 0.73434919, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75708526, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 2.6094818115234375 + }, + { + "auxiliary_loss_clip": 0.0114209, + "auxiliary_loss_mlp": 0.01146033, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.0008837, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 2.1844946513306103, + "language_loss": 0.74484122, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.76772249, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 2.6319539546966553 + }, + { + "auxiliary_loss_clip": 0.01110262, + "auxiliary_loss_mlp": 0.01146987, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00078821, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.7947177079370453, + "language_loss": 0.70204508, + "learning_rate": 3.540097613646296e-06, + "loss": 0.7246176, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 4.150786638259888 + }, + { + "auxiliary_loss_clip": 0.01142214, + "auxiliary_loss_mlp": 0.01147185, + "balance_loss_clip": 1.00223517, + "balance_loss_mlp": 1.0009867, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.5588860918152057, + "language_loss": 0.81235391, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83524793, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.5941579341888428 + }, + { + "auxiliary_loss_clip": 0.01175818, + "auxiliary_loss_mlp": 0.01146133, + "balance_loss_clip": 1.00231659, + "balance_loss_mlp": 1.00079274, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5893535190522288, + "language_loss": 0.78135556, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80457509, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.474520206451416 + }, + { + "auxiliary_loss_clip": 0.01128435, + "auxiliary_loss_mlp": 0.01146272, + "balance_loss_clip": 1.00206423, + "balance_loss_mlp": 1.00112247, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.51776674277972, + "language_loss": 0.83997267, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86271977, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.6338436603546143 + }, + { + "auxiliary_loss_clip": 0.01127054, + "auxiliary_loss_mlp": 0.01147277, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00088787, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 2.758433242211568, + "language_loss": 0.54874098, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57148421, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.6826331615448 + }, + { + "auxiliary_loss_clip": 0.01160012, + "auxiliary_loss_mlp": 0.01147423, + "balance_loss_clip": 1.00231814, + "balance_loss_mlp": 1.00112963, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.306984294620146, + "language_loss": 0.79875541, + "learning_rate": 3.538854530318506e-06, + "loss": 0.8218298, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.5489020347595215 + }, + { + "auxiliary_loss_clip": 0.01159418, + "auxiliary_loss_mlp": 0.0114697, + "balance_loss_clip": 1.00232947, + "balance_loss_mlp": 1.00096238, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 2.0673648736184504, + "language_loss": 0.79374915, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81681305, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 2.540602922439575 + }, + { + "auxiliary_loss_clip": 0.01175723, + "auxiliary_loss_mlp": 0.01147191, + "balance_loss_clip": 1.00221276, + "balance_loss_mlp": 1.00099254, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.7605272290296439, + "language_loss": 0.85961175, + "learning_rate": 3.538356888446756e-06, + "loss": 0.88284087, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 4.022033214569092 + }, + { + "auxiliary_loss_clip": 0.01159335, + "auxiliary_loss_mlp": 0.01146075, + "balance_loss_clip": 1.00233448, + "balance_loss_mlp": 1.0006392, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.5061958414825876, + "language_loss": 0.74119806, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76425219, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01125646, + "auxiliary_loss_mlp": 0.01147655, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00117087, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 2.551824343675508, + "language_loss": 0.73733509, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.76006818, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.66108775138855 + }, + { + "auxiliary_loss_clip": 0.01175692, + "auxiliary_loss_mlp": 0.01146863, + "balance_loss_clip": 1.00237799, + "balance_loss_mlp": 1.0010463, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.8564186910066454, + "language_loss": 0.76271141, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78593695, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 3.980191230773926 + }, + { + "auxiliary_loss_clip": 0.01128108, + "auxiliary_loss_mlp": 0.01147009, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00109637, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 2.0811204137553823, + "language_loss": 0.85161877, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87436992, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 2.6600229740142822 + }, + { + "auxiliary_loss_clip": 0.01143626, + "auxiliary_loss_mlp": 0.01147767, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.00090098, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 2.4512028232212524, + "language_loss": 0.68559134, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70850527, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.6015663146972656 + }, + { + "auxiliary_loss_clip": 0.01159995, + "auxiliary_loss_mlp": 0.01147185, + "balance_loss_clip": 1.00225544, + "balance_loss_mlp": 1.00089157, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 2.070171131369197, + "language_loss": 0.70058382, + "learning_rate": 3.536862563102088e-06, + "loss": 0.7236557, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.63333797454834 + }, + { + "auxiliary_loss_clip": 0.01175744, + "auxiliary_loss_mlp": 0.01147527, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.00113821, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.5999989091173203, + "language_loss": 0.84494334, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86817598, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.514796495437622 + }, + { + "auxiliary_loss_clip": 0.01174632, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_clip": 1.00350428, + "balance_loss_mlp": 1.00020874, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.754595888807346, + "language_loss": 0.52250218, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54560095, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 2.995327949523926 + }, + { + "auxiliary_loss_clip": 0.01142448, + "auxiliary_loss_mlp": 0.01147045, + "balance_loss_clip": 1.0021565, + "balance_loss_mlp": 1.00094199, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 2.508870535244975, + "language_loss": 0.72114003, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74403501, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.558387517929077 + }, + { + "auxiliary_loss_clip": 0.01111533, + "auxiliary_loss_mlp": 0.0114662, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00118482, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.4994404861405175, + "language_loss": 0.77886409, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.80144566, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.6949522495269775 + }, + { + "auxiliary_loss_clip": 0.01142279, + "auxiliary_loss_mlp": 0.01146677, + "balance_loss_clip": 1.00229967, + "balance_loss_mlp": 1.00124145, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 2.049436435321074, + "language_loss": 0.80645645, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.829346, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.603799343109131 + }, + { + "auxiliary_loss_clip": 0.01160516, + "auxiliary_loss_mlp": 0.01146999, + "balance_loss_clip": 1.00225341, + "balance_loss_mlp": 1.00099134, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.5776519376371811, + "language_loss": 0.84290624, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.8659814, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.575913667678833 + }, + { + "auxiliary_loss_clip": 0.01143484, + "auxiliary_loss_mlp": 0.01147202, + "balance_loss_clip": 1.00218606, + "balance_loss_mlp": 1.00119424, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.8315047252883045, + "language_loss": 0.79953927, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82244611, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.5674824714660645 + }, + { + "auxiliary_loss_clip": 0.01158913, + "auxiliary_loss_mlp": 0.01146389, + "balance_loss_clip": 1.0023824, + "balance_loss_mlp": 1.00114477, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 2.2415732635769317, + "language_loss": 0.70099545, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7240485, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.5358939170837402 + }, + { + "auxiliary_loss_clip": 0.01143766, + "auxiliary_loss_mlp": 0.01146691, + "balance_loss_clip": 1.00226402, + "balance_loss_mlp": 1.00116074, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.8123340284056075, + "language_loss": 0.67600858, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69891316, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.5721852779388428 + }, + { + "auxiliary_loss_clip": 0.0117492, + "auxiliary_loss_mlp": 0.0113436, + "balance_loss_clip": 1.0038805, + "balance_loss_mlp": 1.00008285, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.8945532664509758, + "language_loss": 0.68769306, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.71078587, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.1593093872070312 + }, + { + "auxiliary_loss_clip": 0.01175694, + "auxiliary_loss_mlp": 0.01146365, + "balance_loss_clip": 1.00240397, + "balance_loss_mlp": 1.00131083, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 1.8554227728262278, + "language_loss": 0.7973901, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.82061064, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.525847911834717 + }, + { + "auxiliary_loss_clip": 0.01143518, + "auxiliary_loss_mlp": 0.00748534, + "balance_loss_clip": 1.0022012, + "balance_loss_mlp": 1.00091529, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.0605854373505768, + "language_loss": 0.82350123, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84242177, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.59517240524292 + }, + { + "auxiliary_loss_clip": 0.01175808, + "auxiliary_loss_mlp": 0.01146794, + "balance_loss_clip": 1.00239134, + "balance_loss_mlp": 1.00116801, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 2.709132312087404, + "language_loss": 0.62029409, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64352012, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.5726518630981445 + }, + { + "auxiliary_loss_clip": 0.01142546, + "auxiliary_loss_mlp": 0.01146678, + "balance_loss_clip": 1.00224352, + "balance_loss_mlp": 1.00076604, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 2.270492404202426, + "language_loss": 0.75581932, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77871156, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 2.6146199703216553 + }, + { + "auxiliary_loss_clip": 0.01175561, + "auxiliary_loss_mlp": 0.01145619, + "balance_loss_clip": 1.00229156, + "balance_loss_mlp": 1.00094664, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.6790581141182175, + "language_loss": 0.74802363, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77123547, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.543511152267456 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01145708, + "balance_loss_clip": 1.00218391, + "balance_loss_mlp": 1.00084472, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.8203321429977948, + "language_loss": 0.82835507, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85123277, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.5898499488830566 + }, + { + "auxiliary_loss_clip": 0.01142235, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_clip": 1.00214171, + "balance_loss_mlp": 1.00107503, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 1.7793421067660453, + "language_loss": 0.73090279, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75378734, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.719428062438965 + }, + { + "auxiliary_loss_clip": 0.01126726, + "auxiliary_loss_mlp": 0.0114543, + "balance_loss_clip": 1.00207627, + "balance_loss_mlp": 1.00104427, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 2.552991387767401, + "language_loss": 0.71948618, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74220777, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.616783618927002 + }, + { + "auxiliary_loss_clip": 0.01143654, + "auxiliary_loss_mlp": 0.01146535, + "balance_loss_clip": 1.00228572, + "balance_loss_mlp": 1.00110006, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.278887684288252, + "language_loss": 0.74535799, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76825988, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.5415945053100586 + }, + { + "auxiliary_loss_clip": 0.01160026, + "auxiliary_loss_mlp": 0.01145787, + "balance_loss_clip": 1.00219488, + "balance_loss_mlp": 1.0010196, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 1.8204298201133398, + "language_loss": 0.85182202, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87488019, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.5936381816864014 + }, + { + "auxiliary_loss_clip": 0.01142513, + "auxiliary_loss_mlp": 0.01146957, + "balance_loss_clip": 1.00226712, + "balance_loss_mlp": 1.00104523, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.8710132991822241, + "language_loss": 0.78927952, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.8121742, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.5997328758239746 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01145297, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.000911, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.5256039239787131, + "language_loss": 0.74946475, + "learning_rate": 3.531365436099496e-06, + "loss": 0.7720294, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.7326927185058594 + }, + { + "auxiliary_loss_clip": 0.01112003, + "auxiliary_loss_mlp": 0.01146708, + "balance_loss_clip": 1.00235438, + "balance_loss_mlp": 1.00108171, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.1116059773448517, + "language_loss": 0.79238528, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.8149724, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 2.643331289291382 + }, + { + "auxiliary_loss_clip": 0.01126739, + "auxiliary_loss_mlp": 0.01144953, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00094819, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.617954567829655, + "language_loss": 0.77150929, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79422629, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.6466445922851562 + }, + { + "auxiliary_loss_clip": 0.01159099, + "auxiliary_loss_mlp": 0.01146055, + "balance_loss_clip": 1.00218034, + "balance_loss_mlp": 1.001001, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 1.9461743988457927, + "language_loss": 0.810624, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83367556, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 4.109977960586548 + }, + { + "auxiliary_loss_clip": 0.0114442, + "auxiliary_loss_mlp": 0.01145741, + "balance_loss_clip": 1.00226402, + "balance_loss_mlp": 1.00106823, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.8317778684213062, + "language_loss": 0.73112041, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.754022, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.5555548667907715 + }, + { + "auxiliary_loss_clip": 0.01127546, + "auxiliary_loss_mlp": 0.01145869, + "balance_loss_clip": 1.00235295, + "balance_loss_mlp": 1.00100636, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.815109623575941, + "language_loss": 0.76651657, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78925073, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.6169252395629883 + }, + { + "auxiliary_loss_clip": 0.01144809, + "auxiliary_loss_mlp": 0.01146273, + "balance_loss_clip": 1.00216031, + "balance_loss_mlp": 1.00083768, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.9418383265262817, + "language_loss": 0.81309283, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83600366, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.580784797668457 + }, + { + "auxiliary_loss_clip": 0.01158883, + "auxiliary_loss_mlp": 0.01145871, + "balance_loss_clip": 1.00222766, + "balance_loss_mlp": 1.00091231, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.770155852895754, + "language_loss": 0.86918437, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89223194, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 2.5299065113067627 + }, + { + "auxiliary_loss_clip": 0.01096791, + "auxiliary_loss_mlp": 0.01135888, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00161111, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7581859649931371, + "language_loss": 0.57490891, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59723568, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.322659969329834 + }, + { + "auxiliary_loss_clip": 0.01142788, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_clip": 1.00396323, + "balance_loss_mlp": 1.00078583, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6420228477221611, + "language_loss": 0.5623585, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58512938, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.333807945251465 + }, + { + "auxiliary_loss_clip": 0.01147608, + "auxiliary_loss_mlp": 0.01145784, + "balance_loss_clip": 1.00256109, + "balance_loss_mlp": 1.000826, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 1.6286732947461264, + "language_loss": 0.77526772, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79820168, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.6902546882629395 + }, + { + "auxiliary_loss_clip": 0.01127963, + "auxiliary_loss_mlp": 0.01146237, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.0009923, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 1.8356497338511795, + "language_loss": 0.76129973, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.7840417, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 4.0531227588653564 + }, + { + "auxiliary_loss_clip": 0.0114306, + "auxiliary_loss_mlp": 0.01145513, + "balance_loss_clip": 1.00225174, + "balance_loss_mlp": 1.00103116, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 1.9653778209682664, + "language_loss": 0.67842817, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70131391, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 4.0675904750823975 + }, + { + "auxiliary_loss_clip": 0.01159052, + "auxiliary_loss_mlp": 0.01145523, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00085068, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.070664820159902, + "language_loss": 0.66558462, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68863034, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.6270511150360107 + }, + { + "auxiliary_loss_clip": 0.01143335, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.00416136, + "balance_loss_mlp": 1.00013304, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.709323671118566, + "language_loss": 0.61468983, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63745964, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.237135171890259 + }, + { + "auxiliary_loss_clip": 0.01175576, + "auxiliary_loss_mlp": 0.0114553, + "balance_loss_clip": 1.00237143, + "balance_loss_mlp": 1.00095296, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7877389922381517, + "language_loss": 0.73350656, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75671762, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.493818759918213 + }, + { + "auxiliary_loss_clip": 0.0114792, + "auxiliary_loss_mlp": 0.01145729, + "balance_loss_clip": 1.00240552, + "balance_loss_mlp": 1.0011518, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.1319579384778593, + "language_loss": 0.75434703, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.77728355, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.6385579109191895 + }, + { + "auxiliary_loss_clip": 0.0115898, + "auxiliary_loss_mlp": 0.01145787, + "balance_loss_clip": 1.00234652, + "balance_loss_mlp": 1.00101912, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.847197581720078, + "language_loss": 0.78365183, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80669951, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.5538582801818848 + }, + { + "auxiliary_loss_clip": 0.01158602, + "auxiliary_loss_mlp": 0.01144951, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00094593, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.8676931698271522, + "language_loss": 0.83803046, + "learning_rate": 3.526846877170133e-06, + "loss": 0.86106598, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.5558857917785645 + }, + { + "auxiliary_loss_clip": 0.01175533, + "auxiliary_loss_mlp": 0.01145365, + "balance_loss_clip": 1.00236917, + "balance_loss_mlp": 1.0011692, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7635473948558482, + "language_loss": 0.76154613, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78475511, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.5162131786346436 + }, + { + "auxiliary_loss_clip": 0.0114466, + "auxiliary_loss_mlp": 0.01145878, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00111032, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.216904438771443, + "language_loss": 0.72343516, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74634051, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.539957284927368 + }, + { + "auxiliary_loss_clip": 0.0117577, + "auxiliary_loss_mlp": 0.01146048, + "balance_loss_clip": 1.00251079, + "balance_loss_mlp": 1.00099444, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 1.539159324969369, + "language_loss": 0.65404403, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67726225, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.5594208240509033 + }, + { + "auxiliary_loss_clip": 0.01111192, + "auxiliary_loss_mlp": 0.01145473, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00108719, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.8024792385399233, + "language_loss": 0.72872609, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.75129277, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.7917988300323486 + }, + { + "auxiliary_loss_clip": 0.01126352, + "auxiliary_loss_mlp": 0.01145033, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00112343, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.9665512435379864, + "language_loss": 0.7933107, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81602454, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.6258344650268555 + }, + { + "auxiliary_loss_clip": 0.01143771, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_clip": 1.0022471, + "balance_loss_mlp": 1.00107741, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.4773833209586704, + "language_loss": 0.81136131, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83426034, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.598574161529541 + }, + { + "auxiliary_loss_clip": 0.01175557, + "auxiliary_loss_mlp": 0.01145322, + "balance_loss_clip": 1.0022558, + "balance_loss_mlp": 1.0010314, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 3.763507409879095, + "language_loss": 0.75266308, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77587187, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.5041205883026123 + }, + { + "auxiliary_loss_clip": 0.01126626, + "auxiliary_loss_mlp": 0.00748393, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00070763, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9043493829258855, + "language_loss": 0.82246673, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84121692, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.627035617828369 + }, + { + "auxiliary_loss_clip": 0.01175409, + "auxiliary_loss_mlp": 0.01145126, + "balance_loss_clip": 1.00223124, + "balance_loss_mlp": 1.00102544, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.0129111083978493, + "language_loss": 0.87138116, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89458656, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.478276491165161 + }, + { + "auxiliary_loss_clip": 0.01108995, + "auxiliary_loss_mlp": 0.01144997, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00099254, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.5417630030947163, + "language_loss": 0.75286692, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77540684, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.698796510696411 + }, + { + "auxiliary_loss_clip": 0.01094522, + "auxiliary_loss_mlp": 0.01133739, + "balance_loss_clip": 1.00319791, + "balance_loss_mlp": 1.00022495, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6693957835522483, + "language_loss": 0.58215171, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60443431, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.3287293910980225 + }, + { + "auxiliary_loss_clip": 0.01142553, + "auxiliary_loss_mlp": 0.01145044, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00094342, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.662959562711381, + "language_loss": 0.83566117, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85853708, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.6449317932128906 + }, + { + "auxiliary_loss_clip": 0.01141125, + "auxiliary_loss_mlp": 0.00747487, + "balance_loss_clip": 1.00293398, + "balance_loss_mlp": 1.00071287, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.901837385288623, + "language_loss": 0.63549733, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65438342, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 3.0133912563323975 + }, + { + "auxiliary_loss_clip": 0.01160088, + "auxiliary_loss_mlp": 0.01146014, + "balance_loss_clip": 1.00230384, + "balance_loss_mlp": 1.00105536, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 2.1398550912276613, + "language_loss": 0.79059732, + "learning_rate": 3.523319470415491e-06, + "loss": 0.8136583, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.549935817718506 + }, + { + "auxiliary_loss_clip": 0.0115873, + "auxiliary_loss_mlp": 0.01144667, + "balance_loss_clip": 1.00213253, + "balance_loss_mlp": 1.00104392, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.8746921777231427, + "language_loss": 0.74174142, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.7647754, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.542375326156616 + }, + { + "auxiliary_loss_clip": 0.01164527, + "auxiliary_loss_mlp": 0.01145107, + "balance_loss_clip": 1.00270224, + "balance_loss_mlp": 1.00100696, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 2.468665975591148, + "language_loss": 0.88531023, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90840662, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.5115468502044678 + }, + { + "auxiliary_loss_clip": 0.01175655, + "auxiliary_loss_mlp": 0.01145476, + "balance_loss_clip": 1.00238323, + "balance_loss_mlp": 1.00070798, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.0397196703553178, + "language_loss": 0.70037389, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.72358525, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.508061647415161 + }, + { + "auxiliary_loss_clip": 0.01175448, + "auxiliary_loss_mlp": 0.01144735, + "balance_loss_clip": 1.00221443, + "balance_loss_mlp": 1.00082505, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.147241024939867, + "language_loss": 0.80106211, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82426393, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01110569, + "auxiliary_loss_mlp": 0.01145085, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00117564, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 1.8912884534366174, + "language_loss": 0.75238037, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77493691, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.663172960281372 + }, + { + "auxiliary_loss_clip": 0.01160049, + "auxiliary_loss_mlp": 0.01144658, + "balance_loss_clip": 1.00229669, + "balance_loss_mlp": 1.0007484, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.373975437229366, + "language_loss": 0.732611, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75565803, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 2.7159035205841064 + }, + { + "auxiliary_loss_clip": 0.01126493, + "auxiliary_loss_mlp": 0.00748491, + "balance_loss_clip": 1.00199604, + "balance_loss_mlp": 1.00090313, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.073306790819694, + "language_loss": 0.69419742, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71294731, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.6159942150115967 + }, + { + "auxiliary_loss_clip": 0.01159817, + "auxiliary_loss_mlp": 0.01144855, + "balance_loss_clip": 1.00215602, + "balance_loss_mlp": 1.00085032, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.0640902613946546, + "language_loss": 0.81261247, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83565915, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 2.5446581840515137 + }, + { + "auxiliary_loss_clip": 0.01164639, + "auxiliary_loss_mlp": 0.00748378, + "balance_loss_clip": 1.00269854, + "balance_loss_mlp": 1.00075579, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 2.2440527191923647, + "language_loss": 0.83809131, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.85722148, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 3.9757423400878906 + }, + { + "auxiliary_loss_clip": 0.01143382, + "auxiliary_loss_mlp": 0.01145815, + "balance_loss_clip": 1.00225985, + "balance_loss_mlp": 1.00104761, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.4223256611833603, + "language_loss": 0.65494281, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67783481, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.6141207218170166 + }, + { + "auxiliary_loss_clip": 0.01115701, + "auxiliary_loss_mlp": 0.0114519, + "balance_loss_clip": 1.00242531, + "balance_loss_mlp": 1.00089884, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.8903189709007484, + "language_loss": 0.75484061, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77744955, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.6928458213806152 + }, + { + "auxiliary_loss_clip": 0.01094151, + "auxiliary_loss_mlp": 0.01145492, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00120091, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.1085841737584103, + "language_loss": 0.76988733, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79228377, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.6406679153442383 + }, + { + "auxiliary_loss_clip": 0.01158543, + "auxiliary_loss_mlp": 0.01144441, + "balance_loss_clip": 1.00220966, + "balance_loss_mlp": 1.00091338, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.8694298938369889, + "language_loss": 0.83901858, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86204839, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.6051506996154785 + }, + { + "auxiliary_loss_clip": 0.01143259, + "auxiliary_loss_mlp": 0.01145265, + "balance_loss_clip": 1.00215757, + "balance_loss_mlp": 1.00087881, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 1.609216265697087, + "language_loss": 0.71392417, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73680949, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 2.532907485961914 + }, + { + "auxiliary_loss_clip": 0.01175541, + "auxiliary_loss_mlp": 0.01146331, + "balance_loss_clip": 1.002285, + "balance_loss_mlp": 1.00070524, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.47741976272399, + "language_loss": 0.61177921, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.6349979, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 2.497753620147705 + }, + { + "auxiliary_loss_clip": 0.01164553, + "auxiliary_loss_mlp": 0.01145215, + "balance_loss_clip": 1.00272918, + "balance_loss_mlp": 1.00063848, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 1.9478162547898668, + "language_loss": 0.78496319, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80806088, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 3.9172093868255615 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01145283, + "balance_loss_clip": 1.00228119, + "balance_loss_mlp": 1.00070632, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.184149689949123, + "language_loss": 0.82708925, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84996611, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 3.9564762115478516 + }, + { + "auxiliary_loss_clip": 0.01126749, + "auxiliary_loss_mlp": 0.01145281, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00079894, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.9353380079129257, + "language_loss": 0.71021444, + "learning_rate": 3.518767600693314e-06, + "loss": 0.73293471, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 4.135396242141724 + }, + { + "auxiliary_loss_clip": 0.01159631, + "auxiliary_loss_mlp": 0.00748389, + "balance_loss_clip": 1.0021553, + "balance_loss_mlp": 1.0007143, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 1.8350204216203638, + "language_loss": 0.66845977, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68753994, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.5603256225585938 + }, + { + "auxiliary_loss_clip": 0.01125941, + "auxiliary_loss_mlp": 0.01144781, + "balance_loss_clip": 1.00208867, + "balance_loss_mlp": 1.00077653, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 1.8702886329056003, + "language_loss": 0.83859277, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86129999, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.7102248668670654 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01145654, + "balance_loss_clip": 1.00245833, + "balance_loss_mlp": 1.00117207, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.480520879202107, + "language_loss": 0.78838205, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81115997, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.6604840755462646 + }, + { + "auxiliary_loss_clip": 0.01156677, + "auxiliary_loss_mlp": 0.01135298, + "balance_loss_clip": 1.00238347, + "balance_loss_mlp": 1.00102031, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8169332016286694, + "language_loss": 0.61033666, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63325638, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.1909120082855225 + }, + { + "auxiliary_loss_clip": 0.01175383, + "auxiliary_loss_mlp": 0.01146083, + "balance_loss_clip": 1.00217581, + "balance_loss_mlp": 1.00122011, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 1.8466218099957723, + "language_loss": 0.73427999, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75749469, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.645644187927246 + }, + { + "auxiliary_loss_clip": 0.01159853, + "auxiliary_loss_mlp": 0.01145614, + "balance_loss_clip": 1.00220764, + "balance_loss_mlp": 1.00103712, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.8385415981023374, + "language_loss": 0.80528617, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.82834089, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.616877794265747 + }, + { + "auxiliary_loss_clip": 0.01141826, + "auxiliary_loss_mlp": 0.01144752, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00093782, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 2.421857636121437, + "language_loss": 0.58666766, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.60953343, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.6969971656799316 + }, + { + "auxiliary_loss_clip": 0.01159819, + "auxiliary_loss_mlp": 0.01145148, + "balance_loss_clip": 1.00219154, + "balance_loss_mlp": 1.00114369, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 1.887837664055232, + "language_loss": 0.79030967, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81335932, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.6255407333374023 + }, + { + "auxiliary_loss_clip": 0.01158703, + "auxiliary_loss_mlp": 0.0074841, + "balance_loss_clip": 1.00211871, + "balance_loss_mlp": 1.00060046, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 1.9160276897647914, + "language_loss": 0.65327621, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67234731, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.5265276432037354 + }, + { + "auxiliary_loss_clip": 0.01141871, + "auxiliary_loss_mlp": 0.011338, + "balance_loss_clip": 1.00339079, + "balance_loss_mlp": 1.00028598, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9373619968993152, + "language_loss": 0.67264551, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.6954022, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.2817773818969727 + }, + { + "auxiliary_loss_clip": 0.01143379, + "auxiliary_loss_mlp": 0.01144616, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00099313, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 11.027367007323498, + "language_loss": 0.89293098, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91581088, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.6272103786468506 + }, + { + "auxiliary_loss_clip": 0.01110492, + "auxiliary_loss_mlp": 0.0114637, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00112557, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8420143486473626, + "language_loss": 0.68461043, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70717907, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.6963253021240234 + }, + { + "auxiliary_loss_clip": 0.01159664, + "auxiliary_loss_mlp": 0.01145391, + "balance_loss_clip": 1.00236714, + "balance_loss_mlp": 1.00100446, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.6552669841366838, + "language_loss": 0.71613514, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73918563, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.5865116119384766 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01145283, + "balance_loss_clip": 1.00198758, + "balance_loss_mlp": 1.00099182, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.7669152315242878, + "language_loss": 0.73001754, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75258684, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 2.6319870948791504 + }, + { + "auxiliary_loss_clip": 0.01160111, + "auxiliary_loss_mlp": 0.01145844, + "balance_loss_clip": 1.00219333, + "balance_loss_mlp": 1.00107646, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 2.645184919804228, + "language_loss": 0.6333245, + "learning_rate": 3.514960119583781e-06, + "loss": 0.65638399, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 2.545598268508911 + }, + { + "auxiliary_loss_clip": 0.01158848, + "auxiliary_loss_mlp": 0.01145217, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00111675, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.0937874425556453, + "language_loss": 0.76914001, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79218066, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.586076259613037 + }, + { + "auxiliary_loss_clip": 0.0115933, + "auxiliary_loss_mlp": 0.01145168, + "balance_loss_clip": 1.00217211, + "balance_loss_mlp": 1.00106823, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 1.8801164173129068, + "language_loss": 0.76266593, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78571093, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.5386228561401367 + }, + { + "auxiliary_loss_clip": 0.01158841, + "auxiliary_loss_mlp": 0.01146037, + "balance_loss_clip": 1.00218642, + "balance_loss_mlp": 1.00117373, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.3128823868310433, + "language_loss": 0.70722383, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.73027259, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.01142361, + "auxiliary_loss_mlp": 0.01145749, + "balance_loss_clip": 1.00206137, + "balance_loss_mlp": 1.00107622, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.5742449769331337, + "language_loss": 0.74801326, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77089441, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.5669403076171875 + }, + { + "auxiliary_loss_clip": 0.01158919, + "auxiliary_loss_mlp": 0.01145465, + "balance_loss_clip": 1.00213134, + "balance_loss_mlp": 1.00117421, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 3.9109797163822284, + "language_loss": 0.77017391, + "learning_rate": 3.513688085236591e-06, + "loss": 0.79321778, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.540329933166504 + }, + { + "auxiliary_loss_clip": 0.01094841, + "auxiliary_loss_mlp": 0.01145418, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00112748, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6783048584582252, + "language_loss": 0.81510216, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83750474, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.6776909828186035 + }, + { + "auxiliary_loss_clip": 0.01143324, + "auxiliary_loss_mlp": 0.01144818, + "balance_loss_clip": 1.00213683, + "balance_loss_mlp": 1.00081325, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.424695595282635, + "language_loss": 0.76063025, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.78351164, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.551100969314575 + }, + { + "auxiliary_loss_clip": 0.01164376, + "auxiliary_loss_mlp": 0.01145454, + "balance_loss_clip": 1.00243437, + "balance_loss_mlp": 1.00097203, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.0710241629730444, + "language_loss": 0.71430242, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73740077, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.5440011024475098 + }, + { + "auxiliary_loss_clip": 0.01174603, + "auxiliary_loss_mlp": 0.01133547, + "balance_loss_clip": 1.00374043, + "balance_loss_mlp": 1.0000329, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7503430304372156, + "language_loss": 0.5675739, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.59065545, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.145231246948242 + }, + { + "auxiliary_loss_clip": 0.01158808, + "auxiliary_loss_mlp": 0.01146387, + "balance_loss_clip": 1.00215411, + "balance_loss_mlp": 1.00104702, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 2.5699507556457526, + "language_loss": 0.81213248, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.83518445, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.5627870559692383 + }, + { + "auxiliary_loss_clip": 0.01143125, + "auxiliary_loss_mlp": 0.00748436, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00076318, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.510037815470572, + "language_loss": 0.87214607, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89106166, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.5543696880340576 + }, + { + "auxiliary_loss_clip": 0.01159052, + "auxiliary_loss_mlp": 0.01146053, + "balance_loss_clip": 1.0022645, + "balance_loss_mlp": 1.00090361, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.754743906805311, + "language_loss": 0.83163249, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85468364, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 2.5489792823791504 + }, + { + "auxiliary_loss_clip": 0.01158852, + "auxiliary_loss_mlp": 0.01145084, + "balance_loss_clip": 1.00227809, + "balance_loss_mlp": 1.00117421, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.8642766997949174, + "language_loss": 0.74021965, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76325899, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 2.562246084213257 + }, + { + "auxiliary_loss_clip": 0.01126026, + "auxiliary_loss_mlp": 0.01145535, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00105369, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.7739226346509829, + "language_loss": 0.74079061, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76350629, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 4.032481908798218 + }, + { + "auxiliary_loss_clip": 0.01125632, + "auxiliary_loss_mlp": 0.01144464, + "balance_loss_clip": 1.00190485, + "balance_loss_mlp": 1.0010314, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.6759641403525665, + "language_loss": 0.81391448, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.83661544, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.6488499641418457 + }, + { + "auxiliary_loss_clip": 0.0115883, + "auxiliary_loss_mlp": 0.01144729, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00091457, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.669806465091912, + "language_loss": 0.79650533, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81954086, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.539064645767212 + }, + { + "auxiliary_loss_clip": 0.01158709, + "auxiliary_loss_mlp": 0.01146227, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00098205, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.4105048574205035, + "language_loss": 0.70254159, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72559094, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.768798828125 + }, + { + "auxiliary_loss_clip": 0.01131843, + "auxiliary_loss_mlp": 0.01144762, + "balance_loss_clip": 1.00229967, + "balance_loss_mlp": 1.00094783, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 2.275658113763451, + "language_loss": 0.77634418, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79911029, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.6707305908203125 + }, + { + "auxiliary_loss_clip": 0.01142397, + "auxiliary_loss_mlp": 0.01144989, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00088906, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.920115156256351, + "language_loss": 0.75791931, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78079319, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 2.6130521297454834 + }, + { + "auxiliary_loss_clip": 0.01174598, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.00364971, + "balance_loss_mlp": 0.99998873, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8332999854292248, + "language_loss": 0.60083282, + "learning_rate": 3.509863377145458e-06, + "loss": 0.6239062, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 3.142822504043579 + }, + { + "auxiliary_loss_clip": 0.01143571, + "auxiliary_loss_mlp": 0.01145007, + "balance_loss_clip": 1.00208783, + "balance_loss_mlp": 1.00100243, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.7471570837233756, + "language_loss": 0.79144377, + "learning_rate": 3.509607938211409e-06, + "loss": 0.8143295, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 4.075446844100952 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01145371, + "balance_loss_clip": 1.00226092, + "balance_loss_mlp": 1.00107968, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.035966477396211, + "language_loss": 0.83076811, + "learning_rate": 3.509352442032875e-06, + "loss": 0.8539753, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 3.9338455200195312 + }, + { + "auxiliary_loss_clip": 0.01111293, + "auxiliary_loss_mlp": 0.01145409, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.00083208, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.0240536018270032, + "language_loss": 0.7118246, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73439163, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.657116413116455 + }, + { + "auxiliary_loss_clip": 0.01127488, + "auxiliary_loss_mlp": 0.01144736, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00082695, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 1.8336962958507197, + "language_loss": 0.80547023, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82819247, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.643864393234253 + }, + { + "auxiliary_loss_clip": 0.01142776, + "auxiliary_loss_mlp": 0.01144597, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.00068748, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.0851882012888034, + "language_loss": 0.82767564, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8505494, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.5656628608703613 + }, + { + "auxiliary_loss_clip": 0.01126518, + "auxiliary_loss_mlp": 0.01145286, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00080419, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.4930668999235532, + "language_loss": 0.82936156, + "learning_rate": 3.508329885067698e-06, + "loss": 0.85207963, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.6067988872528076 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.00748373, + "balance_loss_clip": 1.00228059, + "balance_loss_mlp": 1.00096846, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 4.260467863679497, + "language_loss": 0.75465047, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77388692, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.4829788208007812 + }, + { + "auxiliary_loss_clip": 0.01126032, + "auxiliary_loss_mlp": 0.01145909, + "balance_loss_clip": 1.00201225, + "balance_loss_mlp": 1.00142741, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 1.755404851490646, + "language_loss": 0.70155525, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72427469, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.5966436862945557 + }, + { + "auxiliary_loss_clip": 0.01175241, + "auxiliary_loss_mlp": 0.01145444, + "balance_loss_clip": 1.00229955, + "balance_loss_mlp": 1.00105715, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 2.0740472054219308, + "language_loss": 0.85882765, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88203454, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.4984238147735596 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01145077, + "balance_loss_clip": 1.0023582, + "balance_loss_mlp": 1.00126278, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 2.2596959245297548, + "language_loss": 0.68390274, + "learning_rate": 3.507306412966238e-06, + "loss": 0.7071079, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.632666826248169 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01133979, + "balance_loss_clip": 1.00350118, + "balance_loss_mlp": 1.00046444, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8448130316932263, + "language_loss": 0.70098019, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72375679, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.22584867477417 + }, + { + "auxiliary_loss_clip": 0.01144375, + "auxiliary_loss_mlp": 0.01145302, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00082016, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.67803427778695, + "language_loss": 0.74163216, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76452893, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.6440036296844482 + }, + { + "auxiliary_loss_clip": 0.01158475, + "auxiliary_loss_mlp": 0.01144846, + "balance_loss_clip": 1.00218952, + "balance_loss_mlp": 1.00103235, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 2.6743260235233857, + "language_loss": 0.8283937, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85142696, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.5905823707580566 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01134167, + "balance_loss_clip": 1.00235629, + "balance_loss_mlp": 1.00065219, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7821626617731406, + "language_loss": 0.61565638, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63805819, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.121860980987549 + }, + { + "auxiliary_loss_clip": 0.01125097, + "auxiliary_loss_mlp": 0.01145548, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00097072, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 1.9124489806733351, + "language_loss": 0.78847229, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81117868, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.613908529281616 + }, + { + "auxiliary_loss_clip": 0.01092701, + "auxiliary_loss_mlp": 0.0114496, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00114572, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5538408058359847, + "language_loss": 0.80011833, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82249498, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.6768202781677246 + }, + { + "auxiliary_loss_clip": 0.01158612, + "auxiliary_loss_mlp": 0.01145798, + "balance_loss_clip": 1.00221467, + "balance_loss_mlp": 1.00122118, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 2.0994449718131754, + "language_loss": 0.74461341, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76765752, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.5761475563049316 + }, + { + "auxiliary_loss_clip": 0.01143096, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.00088322, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 3.8633701835699386, + "language_loss": 0.84691548, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.8697868, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.6104750633239746 + }, + { + "auxiliary_loss_clip": 0.01143997, + "auxiliary_loss_mlp": 0.01144678, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00086355, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 2.2533225650853796, + "language_loss": 0.74993747, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77282417, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.6760871410369873 + }, + { + "auxiliary_loss_clip": 0.01157506, + "auxiliary_loss_mlp": 0.01133171, + "balance_loss_clip": 1.00323606, + "balance_loss_mlp": 1.00041986, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7144036968992178, + "language_loss": 0.57469296, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59759974, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 3.227587938308716 + }, + { + "auxiliary_loss_clip": 0.01142143, + "auxiliary_loss_mlp": 0.01145137, + "balance_loss_clip": 1.00229096, + "balance_loss_mlp": 1.00094175, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 1.776915366898499, + "language_loss": 0.75782186, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78069466, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.6073644161224365 + }, + { + "auxiliary_loss_clip": 0.01158558, + "auxiliary_loss_mlp": 0.01144829, + "balance_loss_clip": 1.00211835, + "balance_loss_mlp": 1.00111032, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 3.6146846750603325, + "language_loss": 0.8444463, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86748016, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.521087884902954 + }, + { + "auxiliary_loss_clip": 0.01175326, + "auxiliary_loss_mlp": 0.01144682, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.00124907, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.458447515754927, + "language_loss": 0.88378108, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90698111, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.560417890548706 + }, + { + "auxiliary_loss_clip": 0.01175245, + "auxiliary_loss_mlp": 0.01145369, + "balance_loss_clip": 1.00224149, + "balance_loss_mlp": 1.00088716, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.1972042642575986, + "language_loss": 0.85699248, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88019866, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.5155935287475586 + }, + { + "auxiliary_loss_clip": 0.01158558, + "auxiliary_loss_mlp": 0.01145372, + "balance_loss_clip": 1.00220442, + "balance_loss_mlp": 1.00108075, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.683528626874389, + "language_loss": 0.83102226, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85406148, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.5805625915527344 + }, + { + "auxiliary_loss_clip": 0.01158615, + "auxiliary_loss_mlp": 0.01145338, + "balance_loss_clip": 1.00226057, + "balance_loss_mlp": 1.00114226, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.1638486522906692, + "language_loss": 0.72640598, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.7494455, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.703644037246704 + }, + { + "auxiliary_loss_clip": 0.01175361, + "auxiliary_loss_mlp": 0.01145392, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.0011965, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.6945873533829918, + "language_loss": 0.76827383, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79148138, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.4863438606262207 + }, + { + "auxiliary_loss_clip": 0.01141985, + "auxiliary_loss_mlp": 0.0074841, + "balance_loss_clip": 1.00214934, + "balance_loss_mlp": 1.00080085, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 2.3474525441705802, + "language_loss": 0.73240006, + "learning_rate": 3.502689480360739e-06, + "loss": 0.75130403, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 2.7180607318878174 + }, + { + "auxiliary_loss_clip": 0.01158499, + "auxiliary_loss_mlp": 0.01144791, + "balance_loss_clip": 1.00206959, + "balance_loss_mlp": 1.00126278, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.5946715801181321, + "language_loss": 0.82407367, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84710664, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.746678113937378 + }, + { + "auxiliary_loss_clip": 0.01108742, + "auxiliary_loss_mlp": 0.01145897, + "balance_loss_clip": 1.00203073, + "balance_loss_mlp": 1.00103402, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 2.3080162038799457, + "language_loss": 0.74901235, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77155876, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.651346206665039 + }, + { + "auxiliary_loss_clip": 0.01158544, + "auxiliary_loss_mlp": 0.01144609, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00117636, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.7862577900195487, + "language_loss": 0.7319839, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75501537, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 3.9455666542053223 + }, + { + "auxiliary_loss_clip": 0.01143156, + "auxiliary_loss_mlp": 0.01144868, + "balance_loss_clip": 1.00212801, + "balance_loss_mlp": 1.00086308, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.8385259216094103, + "language_loss": 0.77581942, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79869962, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 2.6294708251953125 + }, + { + "auxiliary_loss_clip": 0.01125924, + "auxiliary_loss_mlp": 0.01144743, + "balance_loss_clip": 1.00183809, + "balance_loss_mlp": 1.0012151, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 3.952420291339099, + "language_loss": 0.7260077, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74871433, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 2.577141046524048 + }, + { + "auxiliary_loss_clip": 0.01159982, + "auxiliary_loss_mlp": 0.01143551, + "balance_loss_clip": 1.00229657, + "balance_loss_mlp": 1.0009768, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 2.145204086829742, + "language_loss": 0.75588346, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77891874, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.7710931301116943 + }, + { + "auxiliary_loss_clip": 0.01126294, + "auxiliary_loss_mlp": 0.01144377, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.0009439, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.6767009719891206, + "language_loss": 0.792512, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81521869, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.6128711700439453 + }, + { + "auxiliary_loss_clip": 0.0115847, + "auxiliary_loss_mlp": 0.01144898, + "balance_loss_clip": 1.00229561, + "balance_loss_mlp": 1.00117934, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.622760181810848, + "language_loss": 0.76157957, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78461325, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.5665552616119385 + }, + { + "auxiliary_loss_clip": 0.01158331, + "auxiliary_loss_mlp": 0.01144359, + "balance_loss_clip": 1.00222278, + "balance_loss_mlp": 1.00083077, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.7955315979144053, + "language_loss": 0.69838828, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72141522, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.5871317386627197 + }, + { + "auxiliary_loss_clip": 0.01159511, + "auxiliary_loss_mlp": 0.01132769, + "balance_loss_clip": 1.00424314, + "balance_loss_mlp": 1.000018, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.769062140203458, + "language_loss": 0.55114222, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57406497, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 3.1885547637939453 + }, + { + "auxiliary_loss_clip": 0.01125212, + "auxiliary_loss_mlp": 0.01144148, + "balance_loss_clip": 1.00190675, + "balance_loss_mlp": 1.00081086, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.7898639524116184, + "language_loss": 0.80161834, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82431191, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 3.98759126663208 + }, + { + "auxiliary_loss_clip": 0.01109937, + "auxiliary_loss_mlp": 0.01144434, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00109696, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 2.1050661772895554, + "language_loss": 0.78304672, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80559039, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 5.556978464126587 + }, + { + "auxiliary_loss_clip": 0.01159337, + "auxiliary_loss_mlp": 0.01144551, + "balance_loss_clip": 1.00216579, + "balance_loss_mlp": 1.00102305, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.085894940535105, + "language_loss": 0.53720045, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.56023932, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.652350425720215 + }, + { + "auxiliary_loss_clip": 0.01143213, + "auxiliary_loss_mlp": 0.01145153, + "balance_loss_clip": 1.00214922, + "balance_loss_mlp": 1.00105309, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.510415104975358, + "language_loss": 0.65142936, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67431295, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.5765926837921143 + }, + { + "auxiliary_loss_clip": 0.01160015, + "auxiliary_loss_mlp": 0.01132068, + "balance_loss_clip": 1.00421882, + "balance_loss_mlp": 1.00007904, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8473006104140558, + "language_loss": 0.58027542, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60319614, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.8771233558654785 + }, + { + "auxiliary_loss_clip": 0.01142228, + "auxiliary_loss_mlp": 0.01144358, + "balance_loss_clip": 1.0022589, + "balance_loss_mlp": 1.00083053, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.6215834265231115, + "language_loss": 0.83469307, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85755891, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.752845048904419 + }, + { + "auxiliary_loss_clip": 0.01158641, + "auxiliary_loss_mlp": 0.01144496, + "balance_loss_clip": 1.00202549, + "balance_loss_mlp": 1.00106335, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.752305328498174, + "language_loss": 0.80566645, + "learning_rate": 3.498312090875666e-06, + "loss": 0.8286978, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.54378342628479 + }, + { + "auxiliary_loss_clip": 0.01144298, + "auxiliary_loss_mlp": 0.01144325, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.00089264, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.2140294159367158, + "language_loss": 0.74823624, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77112246, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.5686628818511963 + }, + { + "auxiliary_loss_clip": 0.01159519, + "auxiliary_loss_mlp": 0.01144321, + "balance_loss_clip": 1.00231123, + "balance_loss_mlp": 1.00098336, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.7039077144865094, + "language_loss": 0.74585998, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76889837, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.5623137950897217 + }, + { + "auxiliary_loss_clip": 0.01164288, + "auxiliary_loss_mlp": 0.01144782, + "balance_loss_clip": 1.0023458, + "balance_loss_mlp": 1.00115871, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.9413038527735138, + "language_loss": 0.81390035, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83699101, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.5042943954467773 + }, + { + "auxiliary_loss_clip": 0.01111965, + "auxiliary_loss_mlp": 0.01144961, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.00086129, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.3038694923244907, + "language_loss": 0.70786822, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73043746, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.688446283340454 + }, + { + "auxiliary_loss_clip": 0.01175345, + "auxiliary_loss_mlp": 0.01144343, + "balance_loss_clip": 1.00233054, + "balance_loss_mlp": 1.00100577, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.8847856842836093, + "language_loss": 0.61802554, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64122248, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.4759607315063477 + }, + { + "auxiliary_loss_clip": 0.0115859, + "auxiliary_loss_mlp": 0.01145509, + "balance_loss_clip": 1.00215745, + "balance_loss_mlp": 1.00159931, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.7199730140930891, + "language_loss": 0.74665892, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76969993, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.537010431289673 + }, + { + "auxiliary_loss_clip": 0.01109146, + "auxiliary_loss_mlp": 0.01143491, + "balance_loss_clip": 1.00203598, + "balance_loss_mlp": 1.00072634, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 2.543190230779828, + "language_loss": 0.80105239, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82357872, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.653366804122925 + }, + { + "auxiliary_loss_clip": 0.01158453, + "auxiliary_loss_mlp": 0.00748277, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00060105, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.724421540018834, + "language_loss": 0.77860779, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79767513, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.5699312686920166 + }, + { + "auxiliary_loss_clip": 0.01158389, + "auxiliary_loss_mlp": 0.01144483, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.0011456, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6164119581662084, + "language_loss": 0.84806645, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.87109518, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.5790529251098633 + }, + { + "auxiliary_loss_clip": 0.01174978, + "auxiliary_loss_mlp": 0.0114383, + "balance_loss_clip": 1.00217593, + "balance_loss_mlp": 1.00106549, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.4571461062897557, + "language_loss": 0.71234059, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73552871, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.5690011978149414 + }, + { + "auxiliary_loss_clip": 0.0117455, + "auxiliary_loss_mlp": 0.01131958, + "balance_loss_clip": 1.00387371, + "balance_loss_mlp": 0.99996918, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.987269587586096, + "language_loss": 0.61882818, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64189327, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 3.0020716190338135 + }, + { + "auxiliary_loss_clip": 0.01159955, + "auxiliary_loss_mlp": 0.01144581, + "balance_loss_clip": 1.00222385, + "balance_loss_mlp": 1.00105333, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.186971203781837, + "language_loss": 0.8611064, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88415182, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.513309955596924 + }, + { + "auxiliary_loss_clip": 0.01126629, + "auxiliary_loss_mlp": 0.01144602, + "balance_loss_clip": 1.00208938, + "balance_loss_mlp": 1.00088358, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.882778854180741, + "language_loss": 0.77528018, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79799247, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.625060558319092 + }, + { + "auxiliary_loss_clip": 0.01158355, + "auxiliary_loss_mlp": 0.01144327, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00098932, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 2.599275986790391, + "language_loss": 0.75642097, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77944773, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.527564764022827 + }, + { + "auxiliary_loss_clip": 0.01164171, + "auxiliary_loss_mlp": 0.01144288, + "balance_loss_clip": 1.00233626, + "balance_loss_mlp": 1.00085497, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 1.6912538966605597, + "language_loss": 0.74084508, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76392972, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.522799253463745 + }, + { + "auxiliary_loss_clip": 0.01175258, + "auxiliary_loss_mlp": 0.01144422, + "balance_loss_clip": 1.00240684, + "balance_loss_mlp": 1.00098979, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.9671096268824393, + "language_loss": 0.86392331, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88712013, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.5254032611846924 + }, + { + "auxiliary_loss_clip": 0.01095124, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_clip": 1.00179398, + "balance_loss_mlp": 1.00083911, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.493008667710455, + "language_loss": 0.75254512, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77492952, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.7250986099243164 + }, + { + "auxiliary_loss_clip": 0.01141977, + "auxiliary_loss_mlp": 0.01144674, + "balance_loss_clip": 1.00208342, + "balance_loss_mlp": 1.00105071, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.3890608548098193, + "language_loss": 0.75228941, + "learning_rate": 3.493659311850379e-06, + "loss": 0.7751559, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.60756516456604 + }, + { + "auxiliary_loss_clip": 0.01128773, + "auxiliary_loss_mlp": 0.00748305, + "balance_loss_clip": 1.00222635, + "balance_loss_mlp": 1.00072932, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 1.8377359047038857, + "language_loss": 0.64852059, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6672914, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.672987937927246 + }, + { + "auxiliary_loss_clip": 0.0117508, + "auxiliary_loss_mlp": 0.01144012, + "balance_loss_clip": 1.00234425, + "balance_loss_mlp": 1.00067437, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 2.4500346839492577, + "language_loss": 0.67598832, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69917923, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.491429090499878 + }, + { + "auxiliary_loss_clip": 0.01175141, + "auxiliary_loss_mlp": 0.01144448, + "balance_loss_clip": 1.00230169, + "balance_loss_mlp": 1.00092006, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.8967326609941775, + "language_loss": 0.75295573, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77615166, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.4969451427459717 + }, + { + "auxiliary_loss_clip": 0.01158476, + "auxiliary_loss_mlp": 0.01144459, + "balance_loss_clip": 1.0021944, + "balance_loss_mlp": 1.00083518, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8907755881409434, + "language_loss": 0.80252349, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82555282, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 2.676217794418335 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01144527, + "balance_loss_clip": 1.00216627, + "balance_loss_mlp": 1.00099909, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.684520479156328, + "language_loss": 0.77049065, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79352123, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.5590944290161133 + }, + { + "auxiliary_loss_clip": 0.01175226, + "auxiliary_loss_mlp": 0.01144696, + "balance_loss_clip": 1.00227141, + "balance_loss_mlp": 1.00069106, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 3.632869922242086, + "language_loss": 0.83583409, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85903335, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 3.9492955207824707 + }, + { + "auxiliary_loss_clip": 0.01158646, + "auxiliary_loss_mlp": 0.01144259, + "balance_loss_clip": 1.00231361, + "balance_loss_mlp": 1.00101721, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.5853344994352438, + "language_loss": 0.73692238, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75995141, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.572450876235962 + }, + { + "auxiliary_loss_clip": 0.0117516, + "auxiliary_loss_mlp": 0.00748257, + "balance_loss_clip": 1.00228071, + "balance_loss_mlp": 1.00069249, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.522053057595242, + "language_loss": 0.72888052, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74811471, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 2.527631998062134 + }, + { + "auxiliary_loss_clip": 0.01157927, + "auxiliary_loss_mlp": 0.01144473, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.00113559, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 1.7268734450001508, + "language_loss": 0.81824028, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84126425, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.5320258140563965 + }, + { + "auxiliary_loss_clip": 0.0115753, + "auxiliary_loss_mlp": 0.01132109, + "balance_loss_clip": 1.0036397, + "balance_loss_mlp": 1.00012016, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6879435485880383, + "language_loss": 0.57662666, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59952295, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 3.2717788219451904 + }, + { + "auxiliary_loss_clip": 0.01175102, + "auxiliary_loss_mlp": 0.01144113, + "balance_loss_clip": 1.00218821, + "balance_loss_mlp": 1.00096667, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.1130080670811076, + "language_loss": 0.65775925, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.68095142, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.5001935958862305 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01143437, + "balance_loss_clip": 1.00224614, + "balance_loss_mlp": 1.00076711, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.76971941636297, + "language_loss": 0.81849658, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.84152603, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 2.541853666305542 + }, + { + "auxiliary_loss_clip": 0.01158535, + "auxiliary_loss_mlp": 0.011449, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00079966, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 10.438945641192179, + "language_loss": 0.83712983, + "learning_rate": 3.490287555252514e-06, + "loss": 0.86016417, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.5026626586914062 + }, + { + "auxiliary_loss_clip": 0.01144201, + "auxiliary_loss_mlp": 0.0114422, + "balance_loss_clip": 1.0021528, + "balance_loss_mlp": 1.00097835, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 1.930907679844514, + "language_loss": 0.8429606, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86584473, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 5.388784646987915 + }, + { + "auxiliary_loss_clip": 0.01096532, + "auxiliary_loss_mlp": 0.01131305, + "balance_loss_clip": 1.00272417, + "balance_loss_mlp": 1.00007975, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7491627970446447, + "language_loss": 0.56251669, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58479512, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 4.6662116050720215 + }, + { + "auxiliary_loss_clip": 0.01141533, + "auxiliary_loss_mlp": 0.01144008, + "balance_loss_clip": 1.00200665, + "balance_loss_mlp": 1.00067031, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.1004415838904062, + "language_loss": 0.80754018, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.83039558, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.596843957901001 + }, + { + "auxiliary_loss_clip": 0.01132677, + "auxiliary_loss_mlp": 0.01131309, + "balance_loss_clip": 1.00380969, + "balance_loss_mlp": 1.00008345, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7871376179144672, + "language_loss": 0.66079277, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68343258, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 3.2069132328033447 + }, + { + "auxiliary_loss_clip": 0.01157802, + "auxiliary_loss_mlp": 0.01143456, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00069094, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 1.9665485811256223, + "language_loss": 0.73276675, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75577933, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.573411464691162 + }, + { + "auxiliary_loss_clip": 0.01133445, + "auxiliary_loss_mlp": 0.0114376, + "balance_loss_clip": 1.00225306, + "balance_loss_mlp": 1.00089931, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 2.2641973200521277, + "language_loss": 0.73100853, + "learning_rate": 3.488728137415357e-06, + "loss": 0.7537806, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.6147472858428955 + }, + { + "auxiliary_loss_clip": 0.01111815, + "auxiliary_loss_mlp": 0.00748302, + "balance_loss_clip": 1.00197399, + "balance_loss_mlp": 1.00076342, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.6336153672316311, + "language_loss": 0.80713046, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.82573164, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.6497597694396973 + }, + { + "auxiliary_loss_clip": 0.01143015, + "auxiliary_loss_mlp": 0.01143321, + "balance_loss_clip": 1.00213981, + "balance_loss_mlp": 1.00084233, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.6220288008177979, + "language_loss": 0.85448605, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87734944, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.613570213317871 + }, + { + "auxiliary_loss_clip": 0.01127869, + "auxiliary_loss_mlp": 0.01145015, + "balance_loss_clip": 1.00216722, + "balance_loss_mlp": 1.00100982, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 2.638297470843822, + "language_loss": 0.74526995, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.76799881, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.6161906719207764 + }, + { + "auxiliary_loss_clip": 0.01111726, + "auxiliary_loss_mlp": 0.01133301, + "balance_loss_clip": 1.00233293, + "balance_loss_mlp": 1.00131214, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.7909688557361972, + "language_loss": 0.6522662, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67471641, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.192168712615967 + }, + { + "auxiliary_loss_clip": 0.01109192, + "auxiliary_loss_mlp": 0.00748111, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00072527, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.7995031907420314, + "language_loss": 0.77022874, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78880173, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.694183111190796 + }, + { + "auxiliary_loss_clip": 0.0112632, + "auxiliary_loss_mlp": 0.01131303, + "balance_loss_clip": 1.00335932, + "balance_loss_mlp": 1.00007772, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7856705717557665, + "language_loss": 0.58461958, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60719579, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.2730517387390137 + }, + { + "auxiliary_loss_clip": 0.01158934, + "auxiliary_loss_mlp": 0.01143205, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00082147, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.7449489439001713, + "language_loss": 0.76741958, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79044092, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.599731922149658 + }, + { + "auxiliary_loss_clip": 0.01174987, + "auxiliary_loss_mlp": 0.01143272, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.00079322, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.6777987224832023, + "language_loss": 0.83344156, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85662413, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.5469398498535156 + }, + { + "auxiliary_loss_clip": 0.01158529, + "auxiliary_loss_mlp": 0.01144247, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00090957, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 3.3412246222813944, + "language_loss": 0.73558903, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.75861681, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.511564016342163 + }, + { + "auxiliary_loss_clip": 0.01142106, + "auxiliary_loss_mlp": 0.00748207, + "balance_loss_clip": 1.00204074, + "balance_loss_mlp": 1.00073087, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.5511373278000566, + "language_loss": 0.82985961, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84876275, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.6472008228302 + }, + { + "auxiliary_loss_clip": 0.01158633, + "auxiliary_loss_mlp": 0.01144132, + "balance_loss_clip": 1.00215471, + "balance_loss_mlp": 1.00098598, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.8474402306214879, + "language_loss": 0.74667466, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76970232, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.5707738399505615 + }, + { + "auxiliary_loss_clip": 0.01127047, + "auxiliary_loss_mlp": 0.01143605, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.00074482, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 2.084034688569993, + "language_loss": 0.81956851, + "learning_rate": 3.485603206979513e-06, + "loss": 0.84227502, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.5901105403900146 + }, + { + "auxiliary_loss_clip": 0.01109365, + "auxiliary_loss_mlp": 0.01143722, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00105262, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 2.1005999555246424, + "language_loss": 0.79297179, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81550264, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.684947967529297 + }, + { + "auxiliary_loss_clip": 0.01126288, + "auxiliary_loss_mlp": 0.01143873, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.00101256, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 2.0015592107434634, + "language_loss": 0.79187465, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81457627, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.628282308578491 + }, + { + "auxiliary_loss_clip": 0.01126618, + "auxiliary_loss_mlp": 0.0074839, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00074148, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 1.842087342127364, + "language_loss": 0.68203419, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70078433, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.6625709533691406 + }, + { + "auxiliary_loss_clip": 0.01141208, + "auxiliary_loss_mlp": 0.01143765, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00071383, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 2.730348693248774, + "language_loss": 0.78549993, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80834961, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.5120184421539307 + }, + { + "auxiliary_loss_clip": 0.01110994, + "auxiliary_loss_mlp": 0.01144237, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.0007093, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.46563538098874, + "language_loss": 0.67889965, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70145202, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.7987351417541504 + }, + { + "auxiliary_loss_clip": 0.0115834, + "auxiliary_loss_mlp": 0.00748218, + "balance_loss_clip": 1.00207746, + "balance_loss_mlp": 1.00072908, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 1.4370097484352031, + "language_loss": 0.87400478, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8930704, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.615452766418457 + }, + { + "auxiliary_loss_clip": 0.01142177, + "auxiliary_loss_mlp": 0.01145002, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.0009023, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.799315079872764, + "language_loss": 0.81728446, + "learning_rate": 3.483776583571541e-06, + "loss": 0.8401562, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.6130502223968506 + }, + { + "auxiliary_loss_clip": 0.01127527, + "auxiliary_loss_mlp": 0.01142808, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00061488, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.5471295594031247, + "language_loss": 0.77464771, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79735106, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.630014419555664 + }, + { + "auxiliary_loss_clip": 0.01142752, + "auxiliary_loss_mlp": 0.01142599, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00059676, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.6426762923143376, + "language_loss": 0.83896846, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86182201, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.63580584526062 + }, + { + "auxiliary_loss_clip": 0.01142515, + "auxiliary_loss_mlp": 0.01144305, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00068188, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 1.9683693612639719, + "language_loss": 0.78489316, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80776131, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 2.614701271057129 + }, + { + "auxiliary_loss_clip": 0.01158978, + "auxiliary_loss_mlp": 0.01143491, + "balance_loss_clip": 1.00218081, + "balance_loss_mlp": 1.0009172, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 3.2267932381441082, + "language_loss": 0.79062176, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81364644, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.609264373779297 + }, + { + "auxiliary_loss_clip": 0.01174817, + "auxiliary_loss_mlp": 0.01143585, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00082016, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.1865360620582344, + "language_loss": 0.78893936, + "learning_rate": 3.482470164419295e-06, + "loss": 0.81212336, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.4812426567077637 + }, + { + "auxiliary_loss_clip": 0.01141585, + "auxiliary_loss_mlp": 0.01143343, + "balance_loss_clip": 1.00205064, + "balance_loss_mlp": 1.00067353, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.862314296336016, + "language_loss": 0.74964994, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77249926, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 4.008562088012695 + }, + { + "auxiliary_loss_clip": 0.0115825, + "auxiliary_loss_mlp": 0.01143642, + "balance_loss_clip": 1.00203383, + "balance_loss_mlp": 1.00087702, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.4275530829193124, + "language_loss": 0.8523671, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.875386, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.5242457389831543 + }, + { + "auxiliary_loss_clip": 0.01158268, + "auxiliary_loss_mlp": 0.01143511, + "balance_loss_clip": 1.00212443, + "balance_loss_mlp": 1.0008415, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.040775168426519, + "language_loss": 0.78804862, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.81106645, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 2.533507823944092 + }, + { + "auxiliary_loss_clip": 0.01125987, + "auxiliary_loss_mlp": 0.0114338, + "balance_loss_clip": 1.00202215, + "balance_loss_mlp": 1.00071037, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.9806928727812871, + "language_loss": 0.8740654, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89675903, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 2.629617929458618 + }, + { + "auxiliary_loss_clip": 0.01174871, + "auxiliary_loss_mlp": 0.01143489, + "balance_loss_clip": 1.0021801, + "balance_loss_mlp": 1.00081921, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5911339703820697, + "language_loss": 0.70233506, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72551864, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.5136353969573975 + }, + { + "auxiliary_loss_clip": 0.01174851, + "auxiliary_loss_mlp": 0.00748092, + "balance_loss_clip": 1.00227308, + "balance_loss_mlp": 1.00068116, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.7919813382798846, + "language_loss": 0.8024255, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82165492, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.5183956623077393 + }, + { + "auxiliary_loss_clip": 0.01099918, + "auxiliary_loss_mlp": 0.01143629, + "balance_loss_clip": 1.00257826, + "balance_loss_mlp": 1.00086403, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.0526520658461536, + "language_loss": 0.70851541, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.73095089, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.8745062351226807 + }, + { + "auxiliary_loss_clip": 0.01144131, + "auxiliary_loss_mlp": 0.01143457, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.0008831, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.0687636075427296, + "language_loss": 0.58336586, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60624176, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 2.6175670623779297 + }, + { + "auxiliary_loss_clip": 0.01157835, + "auxiliary_loss_mlp": 0.01144347, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00100946, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 2.0891372151459313, + "language_loss": 0.64274907, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66577089, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 5.444759845733643 + }, + { + "auxiliary_loss_clip": 0.01141672, + "auxiliary_loss_mlp": 0.01143743, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00069141, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.023802455493886, + "language_loss": 0.71815044, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74100459, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.570528268814087 + }, + { + "auxiliary_loss_clip": 0.01124445, + "auxiliary_loss_mlp": 0.01142584, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00086832, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.4756854923106992, + "language_loss": 0.77272326, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79539353, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.648348808288574 + }, + { + "auxiliary_loss_clip": 0.01174935, + "auxiliary_loss_mlp": 0.00748153, + "balance_loss_clip": 1.00218868, + "balance_loss_mlp": 1.00077808, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 2.2186770799235136, + "language_loss": 0.85258365, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87181455, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.482250690460205 + }, + { + "auxiliary_loss_clip": 0.01128442, + "auxiliary_loss_mlp": 0.01143374, + "balance_loss_clip": 1.00211751, + "balance_loss_mlp": 1.00089526, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 1.564187442911994, + "language_loss": 0.72202969, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74474788, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.597888708114624 + }, + { + "auxiliary_loss_clip": 0.0117497, + "auxiliary_loss_mlp": 0.01143065, + "balance_loss_clip": 1.00223255, + "balance_loss_mlp": 1.00077665, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4860758098762563, + "language_loss": 0.81054354, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83372384, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.462597370147705 + }, + { + "auxiliary_loss_clip": 0.01175065, + "auxiliary_loss_mlp": 0.01143368, + "balance_loss_clip": 1.00237823, + "balance_loss_mlp": 1.00060248, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 2.3268506840840253, + "language_loss": 0.67764622, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.70083052, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.6071836948394775 + }, + { + "auxiliary_loss_clip": 0.01124639, + "auxiliary_loss_mlp": 0.0114337, + "balance_loss_clip": 1.0019902, + "balance_loss_mlp": 1.00098658, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.7845450596643038, + "language_loss": 0.75422668, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77690679, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.6424896717071533 + }, + { + "auxiliary_loss_clip": 0.01127366, + "auxiliary_loss_mlp": 0.01143228, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00084448, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 1.9475868095918685, + "language_loss": 0.80657077, + "learning_rate": 3.478017834441318e-06, + "loss": 0.82927668, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.7600903511047363 + }, + { + "auxiliary_loss_clip": 0.01043923, + "auxiliary_loss_mlp": 0.01144054, + "balance_loss_clip": 1.0017519, + "balance_loss_mlp": 1.00090706, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 1.8568536710950216, + "language_loss": 0.72640371, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74828351, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 2.9271368980407715 + }, + { + "auxiliary_loss_clip": 0.01093525, + "auxiliary_loss_mlp": 0.01144373, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.00094044, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.5711183285220571, + "language_loss": 0.86914152, + "learning_rate": 3.477492965085067e-06, + "loss": 0.8915205, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 2.968965530395508 + }, + { + "auxiliary_loss_clip": 0.0117501, + "auxiliary_loss_mlp": 0.01143761, + "balance_loss_clip": 1.00225639, + "balance_loss_mlp": 1.00109112, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.901377289598824, + "language_loss": 0.84482038, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86800814, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.5117790699005127 + }, + { + "auxiliary_loss_clip": 0.0115819, + "auxiliary_loss_mlp": 0.00748128, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00065041, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.7894667714283825, + "language_loss": 0.8340559, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85311902, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.51556134223938 + }, + { + "auxiliary_loss_clip": 0.01142769, + "auxiliary_loss_mlp": 0.01142806, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00089908, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.4023149917272857, + "language_loss": 0.83093202, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.85378778, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.580336809158325 + }, + { + "auxiliary_loss_clip": 0.0115819, + "auxiliary_loss_mlp": 0.01143385, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.00071502, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.0066804391431905, + "language_loss": 0.675506, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69852167, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.657209634780884 + }, + { + "auxiliary_loss_clip": 0.01157833, + "auxiliary_loss_mlp": 0.01143581, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00072026, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.4466606203978816, + "language_loss": 0.81695521, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83996934, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.5243961811065674 + }, + { + "auxiliary_loss_clip": 0.01109942, + "auxiliary_loss_mlp": 0.01143527, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.0009532, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.7360341859679422, + "language_loss": 0.92322302, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94575769, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.6241202354431152 + }, + { + "auxiliary_loss_clip": 0.0115958, + "auxiliary_loss_mlp": 0.01143543, + "balance_loss_clip": 1.00217283, + "balance_loss_mlp": 1.00087309, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 2.0672079986960816, + "language_loss": 0.67561257, + "learning_rate": 3.475654158020507e-06, + "loss": 0.6986438, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.579897165298462 + }, + { + "auxiliary_loss_clip": 0.0112498, + "auxiliary_loss_mlp": 0.01143662, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00099242, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.1441913268070807, + "language_loss": 0.72410238, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.7467888, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.6493167877197266 + }, + { + "auxiliary_loss_clip": 0.01116165, + "auxiliary_loss_mlp": 0.00748268, + "balance_loss_clip": 1.0024097, + "balance_loss_mlp": 1.00075316, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.156733773263525, + "language_loss": 0.76003706, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.7786814, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.634674549102783 + }, + { + "auxiliary_loss_clip": 0.01140434, + "auxiliary_loss_mlp": 0.01130813, + "balance_loss_clip": 1.00298285, + "balance_loss_mlp": 1.00035, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8297352309617968, + "language_loss": 0.57091939, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59363186, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.0910887718200684 + }, + { + "auxiliary_loss_clip": 0.01142921, + "auxiliary_loss_mlp": 0.01142748, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00065076, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.9808616910634569, + "language_loss": 0.71565926, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73851597, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.5790157318115234 + }, + { + "auxiliary_loss_clip": 0.01174907, + "auxiliary_loss_mlp": 0.01143453, + "balance_loss_clip": 1.0022068, + "balance_loss_mlp": 1.00097418, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.7343039302114804, + "language_loss": 0.84388161, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86706519, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.4832355976104736 + }, + { + "auxiliary_loss_clip": 0.01158231, + "auxiliary_loss_mlp": 0.01142915, + "balance_loss_clip": 1.0022136, + "balance_loss_mlp": 1.00091267, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.4458824376435053, + "language_loss": 0.84733093, + "learning_rate": 3.474075855228966e-06, + "loss": 0.87034243, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.5462756156921387 + }, + { + "auxiliary_loss_clip": 0.01158494, + "auxiliary_loss_mlp": 0.01144035, + "balance_loss_clip": 1.00227714, + "balance_loss_mlp": 1.00088847, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 1.7370799639362635, + "language_loss": 0.77196467, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79499, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.5739903450012207 + }, + { + "auxiliary_loss_clip": 0.01144518, + "auxiliary_loss_mlp": 0.01143963, + "balance_loss_clip": 1.00224721, + "balance_loss_mlp": 1.00100768, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.8740565328229892, + "language_loss": 0.72578394, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74866879, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.659114360809326 + }, + { + "auxiliary_loss_clip": 0.01174798, + "auxiliary_loss_mlp": 0.01143527, + "balance_loss_clip": 1.002105, + "balance_loss_mlp": 1.00085723, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.9194712687849695, + "language_loss": 0.70095861, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.7241419, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.471787691116333 + }, + { + "auxiliary_loss_clip": 0.01174798, + "auxiliary_loss_mlp": 0.01143746, + "balance_loss_clip": 1.0021584, + "balance_loss_mlp": 1.00126731, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.4815818567354568, + "language_loss": 0.80176902, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82495439, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.4911465644836426 + }, + { + "auxiliary_loss_clip": 0.01125456, + "auxiliary_loss_mlp": 0.01144038, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00108242, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.6704219755677987, + "language_loss": 0.67007387, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69276881, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 2.7016942501068115 + }, + { + "auxiliary_loss_clip": 0.01115868, + "auxiliary_loss_mlp": 0.01143191, + "balance_loss_clip": 1.00210428, + "balance_loss_mlp": 1.00118923, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.6040146766085701, + "language_loss": 0.79416907, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81675965, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 4.210346937179565 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01143665, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00080514, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 2.7406183858009117, + "language_loss": 0.77870893, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80128753, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.7471394538879395 + }, + { + "auxiliary_loss_clip": 0.01175009, + "auxiliary_loss_mlp": 0.01144005, + "balance_loss_clip": 1.00226533, + "balance_loss_mlp": 1.00123966, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.4237448124821346, + "language_loss": 0.77982152, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80301172, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.527320146560669 + }, + { + "auxiliary_loss_clip": 0.01174895, + "auxiliary_loss_mlp": 0.01142843, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00074601, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.6764246007264292, + "language_loss": 0.76253104, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78570849, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.5141117572784424 + }, + { + "auxiliary_loss_clip": 0.01142269, + "auxiliary_loss_mlp": 0.01143197, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.0010041, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.9179653097859726, + "language_loss": 0.76793617, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.79079086, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 2.5901095867156982 + }, + { + "auxiliary_loss_clip": 0.01127763, + "auxiliary_loss_mlp": 0.01143428, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00094914, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.6971220887790672, + "language_loss": 0.71016759, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73287952, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 2.6063690185546875 + }, + { + "auxiliary_loss_clip": 0.01142979, + "auxiliary_loss_mlp": 0.01143885, + "balance_loss_clip": 1.00211382, + "balance_loss_mlp": 1.00083435, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 4.900426667387324, + "language_loss": 0.75090063, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.77376926, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.5530543327331543 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.0114356, + "balance_loss_clip": 1.00194347, + "balance_loss_mlp": 1.00108063, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.7733773356012565, + "language_loss": 0.73457515, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75727373, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 2.6229310035705566 + }, + { + "auxiliary_loss_clip": 0.01158155, + "auxiliary_loss_mlp": 0.00748271, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00059843, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 1.7862115134754175, + "language_loss": 0.66760659, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.6866709, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 5.6064605712890625 + }, + { + "auxiliary_loss_clip": 0.01109091, + "auxiliary_loss_mlp": 0.01143865, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.00109959, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.5718507276007085, + "language_loss": 0.70726371, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72979325, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 4.144841194152832 + }, + { + "auxiliary_loss_clip": 0.01158092, + "auxiliary_loss_mlp": 0.01143026, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00073791, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 1.9733582505880713, + "language_loss": 0.73210055, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75511169, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.01143904, + "auxiliary_loss_mlp": 0.00748076, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00055301, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.7232974700534345, + "language_loss": 0.86714107, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88606083, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.625000238418579 + }, + { + "auxiliary_loss_clip": 0.01174861, + "auxiliary_loss_mlp": 0.00748126, + "balance_loss_clip": 1.00219429, + "balance_loss_mlp": 1.00055337, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.5383869776115973, + "language_loss": 0.80570793, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82493788, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.515850305557251 + }, + { + "auxiliary_loss_clip": 0.01143002, + "auxiliary_loss_mlp": 0.00748036, + "balance_loss_clip": 1.00200975, + "balance_loss_mlp": 1.00048327, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.541519121450793, + "language_loss": 0.88218719, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.90109754, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.62926983833313 + }, + { + "auxiliary_loss_clip": 0.01174839, + "auxiliary_loss_mlp": 0.01142584, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.00096345, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.4517933712173225, + "language_loss": 0.7786575, + "learning_rate": 3.468800324801802e-06, + "loss": 0.80183172, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.558696985244751 + }, + { + "auxiliary_loss_clip": 0.01174845, + "auxiliary_loss_mlp": 0.01143505, + "balance_loss_clip": 1.00223327, + "balance_loss_mlp": 1.00093055, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.5657587459545947, + "language_loss": 0.75441635, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77759981, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.542585849761963 + }, + { + "auxiliary_loss_clip": 0.01147465, + "auxiliary_loss_mlp": 0.01143098, + "balance_loss_clip": 1.00240409, + "balance_loss_mlp": 1.00100017, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.5486952155691331, + "language_loss": 0.69002342, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71292907, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.7031421661376953 + }, + { + "auxiliary_loss_clip": 0.01128325, + "auxiliary_loss_mlp": 0.01143324, + "balance_loss_clip": 1.0020777, + "balance_loss_mlp": 1.00084496, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 1.838498957487131, + "language_loss": 0.79217243, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81488895, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.705728054046631 + }, + { + "auxiliary_loss_clip": 0.01174605, + "auxiliary_loss_mlp": 0.01142551, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00093055, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.6074915061187798, + "language_loss": 0.80708516, + "learning_rate": 3.467742542694501e-06, + "loss": 0.8302567, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.475431203842163 + }, + { + "auxiliary_loss_clip": 0.01142184, + "auxiliary_loss_mlp": 0.01143371, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00079679, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 2.289478993897309, + "language_loss": 0.79646051, + "learning_rate": 3.46747795800024e-06, + "loss": 0.81931603, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.612107276916504 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01130708, + "balance_loss_clip": 1.00355411, + "balance_loss_mlp": 1.00024533, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8546062181644263, + "language_loss": 0.60783595, + "learning_rate": 3.467213317659068e-06, + "loss": 0.63073248, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.1555161476135254 + }, + { + "auxiliary_loss_clip": 0.01126138, + "auxiliary_loss_mlp": 0.01143605, + "balance_loss_clip": 1.0021795, + "balance_loss_mlp": 1.00103116, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 5.129083849762012, + "language_loss": 0.77246499, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79516244, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.6616392135620117 + }, + { + "auxiliary_loss_clip": 0.01141364, + "auxiliary_loss_mlp": 0.011431, + "balance_loss_clip": 1.00193751, + "balance_loss_mlp": 1.00090683, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 1.86994167838927, + "language_loss": 0.74251997, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76536465, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.5464789867401123 + }, + { + "auxiliary_loss_clip": 0.01159213, + "auxiliary_loss_mlp": 0.01143187, + "balance_loss_clip": 1.00220013, + "balance_loss_mlp": 1.0008992, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 3.327049918982269, + "language_loss": 0.80709243, + "learning_rate": 3.466419062854447e-06, + "loss": 0.83011639, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.5914154052734375 + }, + { + "auxiliary_loss_clip": 0.01109435, + "auxiliary_loss_mlp": 0.01142327, + "balance_loss_clip": 1.00182629, + "balance_loss_mlp": 1.00089765, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.6201240413879754, + "language_loss": 0.7654438, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78796148, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 2.6743195056915283 + }, + { + "auxiliary_loss_clip": 0.01096622, + "auxiliary_loss_mlp": 0.01143205, + "balance_loss_clip": 1.00205338, + "balance_loss_mlp": 1.00091672, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 2.6198346767123457, + "language_loss": 0.82545739, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84785569, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.727966785430908 + }, + { + "auxiliary_loss_clip": 0.0117483, + "auxiliary_loss_mlp": 0.01142791, + "balance_loss_clip": 1.00221276, + "balance_loss_mlp": 1.00088406, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 1.9788839536661684, + "language_loss": 0.76809531, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79127151, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.542491912841797 + }, + { + "auxiliary_loss_clip": 0.01163802, + "auxiliary_loss_mlp": 0.01142571, + "balance_loss_clip": 1.00220847, + "balance_loss_mlp": 1.00066447, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.7964568597177353, + "language_loss": 0.66217446, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68523812, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.7834794521331787 + }, + { + "auxiliary_loss_clip": 0.01095771, + "auxiliary_loss_mlp": 0.01143528, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00104916, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 6.361348908419442, + "language_loss": 0.73235202, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75474507, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.6775290966033936 + }, + { + "auxiliary_loss_clip": 0.01174944, + "auxiliary_loss_mlp": 0.01143473, + "balance_loss_clip": 1.00228155, + "balance_loss_mlp": 1.00089908, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.173042641102552, + "language_loss": 0.86539686, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88858098, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.523747205734253 + }, + { + "auxiliary_loss_clip": 0.011419, + "auxiliary_loss_mlp": 0.01143115, + "balance_loss_clip": 1.00202739, + "balance_loss_mlp": 1.00073099, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 2.476232212512921, + "language_loss": 0.76425159, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78710175, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.5687503814697266 + }, + { + "auxiliary_loss_clip": 0.01158881, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.0008539, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.7695032857392932, + "language_loss": 0.75894195, + "learning_rate": 3.464298604081606e-06, + "loss": 0.7819593, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.5791122913360596 + }, + { + "auxiliary_loss_clip": 0.01127708, + "auxiliary_loss_mlp": 0.01143237, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00075841, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4385040981601669, + "language_loss": 0.73646164, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75917107, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.651884078979492 + }, + { + "auxiliary_loss_clip": 0.01124566, + "auxiliary_loss_mlp": 0.01143966, + "balance_loss_clip": 1.00173759, + "balance_loss_mlp": 1.00101042, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.9720885348216268, + "language_loss": 0.90673929, + "learning_rate": 3.463767933923799e-06, + "loss": 0.92942458, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.6170639991760254 + }, + { + "auxiliary_loss_clip": 0.01163895, + "auxiliary_loss_mlp": 0.01142641, + "balance_loss_clip": 1.00241661, + "balance_loss_mlp": 1.00082994, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.9555776531100517, + "language_loss": 0.80320346, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82626879, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.5014989376068115 + }, + { + "auxiliary_loss_clip": 0.01158037, + "auxiliary_loss_mlp": 0.01143228, + "balance_loss_clip": 1.00203657, + "balance_loss_mlp": 1.00113034, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8482847515074612, + "language_loss": 0.6275233, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.65053594, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.5261940956115723 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01143047, + "balance_loss_clip": 1.00211489, + "balance_loss_mlp": 1.00085449, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 1.7689672041254547, + "language_loss": 0.84062874, + "learning_rate": 3.462971512415555e-06, + "loss": 0.86365414, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.525162696838379 + }, + { + "auxiliary_loss_clip": 0.01157144, + "auxiliary_loss_mlp": 0.01130172, + "balance_loss_clip": 1.00350451, + "balance_loss_mlp": 1.00047207, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.7954548330736999, + "language_loss": 0.70543802, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72831112, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 3.0303385257720947 + }, + { + "auxiliary_loss_clip": 0.01144017, + "auxiliary_loss_mlp": 0.01143149, + "balance_loss_clip": 1.00216353, + "balance_loss_mlp": 1.00095606, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.9828349821179632, + "language_loss": 0.77281153, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79568315, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 4.1405415534973145 + }, + { + "auxiliary_loss_clip": 0.01111152, + "auxiliary_loss_mlp": 0.01143909, + "balance_loss_clip": 1.00205517, + "balance_loss_mlp": 1.00123906, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.230270869350941, + "language_loss": 0.68465924, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70720983, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 2.6938202381134033 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01143131, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00093818, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9539669386000453, + "language_loss": 0.6730305, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69556367, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.647897958755493 + }, + { + "auxiliary_loss_clip": 0.01157222, + "auxiliary_loss_mlp": 0.01129806, + "balance_loss_clip": 1.00355566, + "balance_loss_mlp": 1.00010586, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.9049677721693771, + "language_loss": 0.53203213, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55490237, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 3.034712553024292 + }, + { + "auxiliary_loss_clip": 0.01159691, + "auxiliary_loss_mlp": 0.01143393, + "balance_loss_clip": 1.00227249, + "balance_loss_mlp": 1.00081849, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.9451690402355737, + "language_loss": 0.84001321, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86304408, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.6007635593414307 + }, + { + "auxiliary_loss_clip": 0.01144193, + "auxiliary_loss_mlp": 0.01143609, + "balance_loss_clip": 1.00209677, + "balance_loss_mlp": 1.00074863, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 3.110651680713646, + "language_loss": 0.66863585, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.6915139, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.6340525150299072 + }, + { + "auxiliary_loss_clip": 0.01142581, + "auxiliary_loss_mlp": 0.01143491, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00091624, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 2.034520875499263, + "language_loss": 0.7850579, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80791861, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 2.6239068508148193 + }, + { + "auxiliary_loss_clip": 0.01142763, + "auxiliary_loss_mlp": 0.011431, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00081182, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.8340524084619692, + "language_loss": 0.68556392, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70842254, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 2.649766445159912 + }, + { + "auxiliary_loss_clip": 0.01158184, + "auxiliary_loss_mlp": 0.01143526, + "balance_loss_clip": 1.0021497, + "balance_loss_mlp": 1.00104666, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.7006992792396056, + "language_loss": 0.84220028, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86521733, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 5.58322286605835 + }, + { + "auxiliary_loss_clip": 0.01126049, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.00207341, + "balance_loss_mlp": 1.00107145, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.7887822380067206, + "language_loss": 0.65790451, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.68059766, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 4.13693904876709 + }, + { + "auxiliary_loss_clip": 0.0114183, + "auxiliary_loss_mlp": 0.01129853, + "balance_loss_clip": 1.00351369, + "balance_loss_mlp": 1.00015378, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8887072791707558, + "language_loss": 0.6110847, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63380146, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 3.2682368755340576 + }, + { + "auxiliary_loss_clip": 0.01174849, + "auxiliary_loss_mlp": 0.01142997, + "balance_loss_clip": 1.00222826, + "balance_loss_mlp": 1.00080395, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.612417851555274, + "language_loss": 0.72044957, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74362803, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 2.459742546081543 + }, + { + "auxiliary_loss_clip": 0.01143256, + "auxiliary_loss_mlp": 0.00748186, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00053966, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 2.1126772746916314, + "language_loss": 0.77401835, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79293275, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.6540095806121826 + }, + { + "auxiliary_loss_clip": 0.01174758, + "auxiliary_loss_mlp": 0.01143112, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00091887, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.9152833756916352, + "language_loss": 0.75949061, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78266931, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.4759271144866943 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01142531, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00081527, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 1.8785894217061612, + "language_loss": 0.69738805, + "learning_rate": 3.458715505320736e-06, + "loss": 0.72040629, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.5072450637817383 + }, + { + "auxiliary_loss_clip": 0.01142101, + "auxiliary_loss_mlp": 0.01142312, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00078702, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 3.002488699040455, + "language_loss": 0.78906071, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81190485, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.5739846229553223 + }, + { + "auxiliary_loss_clip": 0.01141715, + "auxiliary_loss_mlp": 0.01143355, + "balance_loss_clip": 1.00201893, + "balance_loss_mlp": 1.00097203, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 2.1241719522097484, + "language_loss": 0.83587754, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85872829, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.585671901702881 + }, + { + "auxiliary_loss_clip": 0.01159635, + "auxiliary_loss_mlp": 0.01143239, + "balance_loss_clip": 1.00218058, + "balance_loss_mlp": 1.00085568, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.5512277730024933, + "language_loss": 0.71294659, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73597527, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.5167553424835205 + }, + { + "auxiliary_loss_clip": 0.0117407, + "auxiliary_loss_mlp": 0.0112974, + "balance_loss_clip": 1.00373673, + "balance_loss_mlp": 1.00004005, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7064833180503286, + "language_loss": 0.56369245, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58673054, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.2241220474243164 + }, + { + "auxiliary_loss_clip": 0.0114153, + "auxiliary_loss_mlp": 0.01142118, + "balance_loss_clip": 1.00204754, + "balance_loss_mlp": 1.00078404, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.5245057963879616, + "language_loss": 0.77586859, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79870504, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.6505215167999268 + }, + { + "auxiliary_loss_clip": 0.01124577, + "auxiliary_loss_mlp": 0.0114281, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00099814, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.5991001488852223, + "language_loss": 0.71445858, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73713243, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.612349033355713 + }, + { + "auxiliary_loss_clip": 0.01141591, + "auxiliary_loss_mlp": 0.01143709, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.0009439, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.733063822032325, + "language_loss": 0.80869329, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83154625, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.655747175216675 + }, + { + "auxiliary_loss_clip": 0.01142776, + "auxiliary_loss_mlp": 0.01142074, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.00073898, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 1.8783087127777132, + "language_loss": 0.66313684, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68598533, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.678316593170166 + }, + { + "auxiliary_loss_clip": 0.01114535, + "auxiliary_loss_mlp": 0.01143346, + "balance_loss_clip": 1.00223708, + "balance_loss_mlp": 1.00105762, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.872670401687377, + "language_loss": 0.6969955, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71957421, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.647265911102295 + }, + { + "auxiliary_loss_clip": 0.01157782, + "auxiliary_loss_mlp": 0.01142624, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00081277, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.7744651546752093, + "language_loss": 0.78967798, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81268197, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.7999634742736816 + }, + { + "auxiliary_loss_clip": 0.01141535, + "auxiliary_loss_mlp": 0.01142962, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.00105548, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 1.9694390690680272, + "language_loss": 0.76526141, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78810632, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.585752487182617 + }, + { + "auxiliary_loss_clip": 0.01124983, + "auxiliary_loss_mlp": 0.0114393, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00087833, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.0601780469125566, + "language_loss": 0.77548647, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.79817557, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.6369287967681885 + }, + { + "auxiliary_loss_clip": 0.01143972, + "auxiliary_loss_mlp": 0.01142846, + "balance_loss_clip": 1.00200641, + "balance_loss_mlp": 1.00074816, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 1.9148324847840104, + "language_loss": 0.64297473, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66584289, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.6204490661621094 + }, + { + "auxiliary_loss_clip": 0.0115941, + "auxiliary_loss_mlp": 0.01143036, + "balance_loss_clip": 1.00213802, + "balance_loss_mlp": 1.00093853, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8087770330759472, + "language_loss": 0.82557428, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84859872, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.503465175628662 + }, + { + "auxiliary_loss_clip": 0.0112642, + "auxiliary_loss_mlp": 0.01143805, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00103951, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.0027851933340957, + "language_loss": 0.70090997, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.72361231, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.7740564346313477 + }, + { + "auxiliary_loss_clip": 0.01159496, + "auxiliary_loss_mlp": 0.01142768, + "balance_loss_clip": 1.00225782, + "balance_loss_mlp": 1.00086117, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 3.9619710787021902, + "language_loss": 0.69642603, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71944869, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.567061424255371 + }, + { + "auxiliary_loss_clip": 0.0115895, + "auxiliary_loss_mlp": 0.01142505, + "balance_loss_clip": 1.00217462, + "balance_loss_mlp": 1.00097942, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.3111318682337174, + "language_loss": 0.70305306, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72606766, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.5837185382843018 + }, + { + "auxiliary_loss_clip": 0.01127248, + "auxiliary_loss_mlp": 0.01142842, + "balance_loss_clip": 1.00203145, + "balance_loss_mlp": 1.00103045, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.799908620409962, + "language_loss": 0.85437584, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87707669, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.610999822616577 + }, + { + "auxiliary_loss_clip": 0.01142155, + "auxiliary_loss_mlp": 0.01142826, + "balance_loss_clip": 1.00210547, + "balance_loss_mlp": 1.00091982, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 3.8729469776979917, + "language_loss": 0.76972502, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79257488, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.5455288887023926 + }, + { + "auxiliary_loss_clip": 0.0115832, + "auxiliary_loss_mlp": 0.01143568, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00127959, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 2.467847104525296, + "language_loss": 0.76439011, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78740901, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.561737537384033 + }, + { + "auxiliary_loss_clip": 0.01174671, + "auxiliary_loss_mlp": 0.01142478, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.0007621, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.9861870343142631, + "language_loss": 0.86381984, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88699126, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.5091471672058105 + }, + { + "auxiliary_loss_clip": 0.01157103, + "auxiliary_loss_mlp": 0.0112965, + "balance_loss_clip": 1.00297618, + "balance_loss_mlp": 0.99995005, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8087668536545464, + "language_loss": 0.60382235, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62668991, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.1822879314422607 + }, + { + "auxiliary_loss_clip": 0.01148132, + "auxiliary_loss_mlp": 0.01143214, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00092602, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 3.258542037642506, + "language_loss": 0.7767396, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79965311, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.62569522857666 + }, + { + "auxiliary_loss_clip": 0.01126538, + "auxiliary_loss_mlp": 0.00747185, + "balance_loss_clip": 1.00331116, + "balance_loss_mlp": 1.00047779, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9115219581394709, + "language_loss": 0.58740163, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60613883, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 4.659663200378418 + }, + { + "auxiliary_loss_clip": 0.01159566, + "auxiliary_loss_mlp": 0.01142615, + "balance_loss_clip": 1.00225985, + "balance_loss_mlp": 1.00089884, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.9754577426892106, + "language_loss": 0.68581784, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70883971, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.5478498935699463 + }, + { + "auxiliary_loss_clip": 0.01158266, + "auxiliary_loss_mlp": 0.01143772, + "balance_loss_clip": 1.00218308, + "balance_loss_mlp": 1.00091136, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 2.353995040009847, + "language_loss": 0.83812082, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.8611412, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 2.5473461151123047 + }, + { + "auxiliary_loss_clip": 0.01142821, + "auxiliary_loss_mlp": 0.01143668, + "balance_loss_clip": 1.00212276, + "balance_loss_mlp": 1.00090289, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.1826578191119155, + "language_loss": 0.70043224, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72329712, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.5734846591949463 + }, + { + "auxiliary_loss_clip": 0.01127171, + "auxiliary_loss_mlp": 0.01142529, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.0007174, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 2.5471230142910954, + "language_loss": 0.86512017, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.88781714, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 2.584277868270874 + }, + { + "auxiliary_loss_clip": 0.01096304, + "auxiliary_loss_mlp": 0.0112997, + "balance_loss_clip": 1.00329852, + "balance_loss_mlp": 1.00027072, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7802380072563001, + "language_loss": 0.55009496, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57235777, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 3.032390594482422 + }, + { + "auxiliary_loss_clip": 0.01158937, + "auxiliary_loss_mlp": 0.01143133, + "balance_loss_clip": 1.00213575, + "balance_loss_mlp": 1.00103521, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 1.966288071164675, + "language_loss": 0.77950913, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80252981, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 2.635352611541748 + }, + { + "auxiliary_loss_clip": 0.01157734, + "auxiliary_loss_mlp": 0.01143047, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00066292, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.5954284771290288, + "language_loss": 0.67013741, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69314522, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 3.9642083644866943 + }, + { + "auxiliary_loss_clip": 0.01125911, + "auxiliary_loss_mlp": 0.01143312, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00102389, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.5314277141012502, + "language_loss": 0.86521906, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88791132, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 4.101343870162964 + }, + { + "auxiliary_loss_clip": 0.01142655, + "auxiliary_loss_mlp": 0.01143271, + "balance_loss_clip": 1.00202882, + "balance_loss_mlp": 1.00079226, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.5846639591650526, + "language_loss": 0.76042938, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.7832886, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 3.918966054916382 + }, + { + "auxiliary_loss_clip": 0.01126227, + "auxiliary_loss_mlp": 0.01143515, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.0009402, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.7678908676200664, + "language_loss": 0.8822149, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90491235, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 2.604752540588379 + }, + { + "auxiliary_loss_clip": 0.01130699, + "auxiliary_loss_mlp": 0.01143676, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00100589, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8909838506614272, + "language_loss": 0.78335339, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80609715, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.652416229248047 + }, + { + "auxiliary_loss_clip": 0.01159146, + "auxiliary_loss_mlp": 0.01143192, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00080836, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.948089653996186, + "language_loss": 0.88355684, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90658033, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.5712695121765137 + }, + { + "auxiliary_loss_clip": 0.01143572, + "auxiliary_loss_mlp": 0.01142132, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00070226, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.6950115268626555, + "language_loss": 0.76082063, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78367764, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.577707290649414 + }, + { + "auxiliary_loss_clip": 0.01174842, + "auxiliary_loss_mlp": 0.01143257, + "balance_loss_clip": 1.00225878, + "balance_loss_mlp": 1.00077808, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 2.045775819071507, + "language_loss": 0.70418811, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72736913, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.4890997409820557 + }, + { + "auxiliary_loss_clip": 0.01142392, + "auxiliary_loss_mlp": 0.01143358, + "balance_loss_clip": 1.00201869, + "balance_loss_mlp": 1.0009743, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7752597597618642, + "language_loss": 0.83855587, + "learning_rate": 3.448282246369912e-06, + "loss": 0.86141336, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.571000099182129 + }, + { + "auxiliary_loss_clip": 0.01126761, + "auxiliary_loss_mlp": 0.01142605, + "balance_loss_clip": 1.00196111, + "balance_loss_mlp": 1.00060332, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.7913616728924413, + "language_loss": 0.76136261, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78405625, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.7133660316467285 + }, + { + "auxiliary_loss_clip": 0.01127571, + "auxiliary_loss_mlp": 0.01142431, + "balance_loss_clip": 1.00207436, + "balance_loss_mlp": 1.00071478, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8340215284529326, + "language_loss": 0.70924056, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73194057, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 2.746755838394165 + }, + { + "auxiliary_loss_clip": 0.01157857, + "auxiliary_loss_mlp": 0.01142381, + "balance_loss_clip": 1.0021379, + "balance_loss_mlp": 1.00075996, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.9592895010784819, + "language_loss": 0.73420429, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75720668, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.5711731910705566 + }, + { + "auxiliary_loss_clip": 0.01159334, + "auxiliary_loss_mlp": 0.01142501, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00088024, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.950943431812731, + "language_loss": 0.73601174, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.7590301, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.5423099994659424 + }, + { + "auxiliary_loss_clip": 0.01114613, + "auxiliary_loss_mlp": 0.01142797, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00079536, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 1.8272986348834883, + "language_loss": 0.82449675, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84707081, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.6482443809509277 + }, + { + "auxiliary_loss_clip": 0.01157576, + "auxiliary_loss_mlp": 0.0114269, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00116432, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.8580997227807163, + "language_loss": 0.74716461, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.77016723, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.503180980682373 + }, + { + "auxiliary_loss_clip": 0.0115865, + "auxiliary_loss_mlp": 0.01129664, + "balance_loss_clip": 1.0034647, + "balance_loss_mlp": 0.99996465, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8675558361212893, + "language_loss": 0.56950933, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59239244, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.0537643432617188 + }, + { + "auxiliary_loss_clip": 0.01142625, + "auxiliary_loss_mlp": 0.01142303, + "balance_loss_clip": 1.0022521, + "balance_loss_mlp": 1.00106382, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8363664211625292, + "language_loss": 0.74530745, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76815677, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.6390881538391113 + }, + { + "auxiliary_loss_clip": 0.01110473, + "auxiliary_loss_mlp": 0.01142703, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00070143, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.2053077391412157, + "language_loss": 0.87003136, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.8925631, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.6554439067840576 + }, + { + "auxiliary_loss_clip": 0.01159356, + "auxiliary_loss_mlp": 0.01142915, + "balance_loss_clip": 1.0020957, + "balance_loss_mlp": 1.00091267, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.8422443331516978, + "language_loss": 0.76414686, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78716958, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.5767531394958496 + }, + { + "auxiliary_loss_clip": 0.01141212, + "auxiliary_loss_mlp": 0.011425, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.00078356, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.483405286263818, + "language_loss": 0.79722857, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82006574, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.685370683670044 + }, + { + "auxiliary_loss_clip": 0.01159231, + "auxiliary_loss_mlp": 0.011429, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00099325, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.882652593171691, + "language_loss": 0.67083836, + "learning_rate": 3.445055179644071e-06, + "loss": 0.6938597, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.589109182357788 + }, + { + "auxiliary_loss_clip": 0.01174759, + "auxiliary_loss_mlp": 0.0114269, + "balance_loss_clip": 1.00217533, + "balance_loss_mlp": 1.00087905, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.7881586745394746, + "language_loss": 0.79060656, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81378108, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.5938808917999268 + }, + { + "auxiliary_loss_clip": 0.01141273, + "auxiliary_loss_mlp": 0.01142596, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00097573, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 3.249697946439717, + "language_loss": 0.81875086, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84158957, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.5830681324005127 + }, + { + "auxiliary_loss_clip": 0.01157947, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00085139, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 2.5573406159088203, + "language_loss": 0.65782088, + "learning_rate": 3.444247179349548e-06, + "loss": 0.68082035, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.7234368324279785 + }, + { + "auxiliary_loss_clip": 0.01158142, + "auxiliary_loss_mlp": 0.01142843, + "balance_loss_clip": 1.00190318, + "balance_loss_mlp": 1.00093603, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.101926518003104, + "language_loss": 0.74242437, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76543421, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.580432176589966 + }, + { + "auxiliary_loss_clip": 0.01159197, + "auxiliary_loss_mlp": 0.01143145, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00095248, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.7370814255715883, + "language_loss": 0.7799747, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80299807, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.7342312335968018 + }, + { + "auxiliary_loss_clip": 0.01159256, + "auxiliary_loss_mlp": 0.01142672, + "balance_loss_clip": 1.00209463, + "balance_loss_mlp": 1.00095618, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.1706723083560613, + "language_loss": 0.79321444, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81623375, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.50144100189209 + }, + { + "auxiliary_loss_clip": 0.01141163, + "auxiliary_loss_mlp": 0.01142289, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00095475, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.5448709256822635, + "language_loss": 0.80310524, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82593977, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.6268303394317627 + }, + { + "auxiliary_loss_clip": 0.0117481, + "auxiliary_loss_mlp": 0.01143084, + "balance_loss_clip": 1.00225854, + "balance_loss_mlp": 1.00098705, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.5434645567308471, + "language_loss": 0.76950043, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79267937, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.558227062225342 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.01141731, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00077772, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.6480932028745974, + "language_loss": 0.76901966, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79168439, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.6934356689453125 + }, + { + "auxiliary_loss_clip": 0.01126968, + "auxiliary_loss_mlp": 0.00748299, + "balance_loss_clip": 1.0020287, + "balance_loss_mlp": 1.00078368, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 2.2591145234378542, + "language_loss": 0.83213127, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.85088396, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.6012375354766846 + }, + { + "auxiliary_loss_clip": 0.0112632, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_clip": 1.00191581, + "balance_loss_mlp": 1.00070894, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 1.8062946507251476, + "language_loss": 0.71981871, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74250621, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 4.108172416687012 + }, + { + "auxiliary_loss_clip": 0.01174665, + "auxiliary_loss_mlp": 0.01142956, + "balance_loss_clip": 1.00222695, + "balance_loss_mlp": 1.00076294, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.5831836996267095, + "language_loss": 0.82481074, + "learning_rate": 3.441820222206035e-06, + "loss": 0.847987, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.4699862003326416 + }, + { + "auxiliary_loss_clip": 0.01158058, + "auxiliary_loss_mlp": 0.01142692, + "balance_loss_clip": 1.00210452, + "balance_loss_mlp": 1.00088096, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.3770270094975294, + "language_loss": 0.76342237, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78642988, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.609449863433838 + }, + { + "auxiliary_loss_clip": 0.01112099, + "auxiliary_loss_mlp": 0.01142693, + "balance_loss_clip": 1.00224197, + "balance_loss_mlp": 1.00059557, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 1.8515427835929599, + "language_loss": 0.82200676, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84455466, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 2.7201473712921143 + }, + { + "auxiliary_loss_clip": 0.01159443, + "auxiliary_loss_mlp": 0.01142862, + "balance_loss_clip": 1.00221074, + "balance_loss_mlp": 1.00095546, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.229473275669585, + "language_loss": 0.7664116, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78943467, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.6421499252319336 + }, + { + "auxiliary_loss_clip": 0.0117478, + "auxiliary_loss_mlp": 0.01142528, + "balance_loss_clip": 1.00227404, + "balance_loss_mlp": 1.00100267, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.9346171226075874, + "language_loss": 0.8242861, + "learning_rate": 3.440740152620301e-06, + "loss": 0.8474592, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 2.5191538333892822 + }, + { + "auxiliary_loss_clip": 0.01115337, + "auxiliary_loss_mlp": 0.01143325, + "balance_loss_clip": 1.00201476, + "balance_loss_mlp": 1.00132251, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.589885005798475, + "language_loss": 0.87506056, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89764714, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 2.6974375247955322 + }, + { + "auxiliary_loss_clip": 0.01142368, + "auxiliary_loss_mlp": 0.01142718, + "balance_loss_clip": 1.00176346, + "balance_loss_mlp": 1.00071621, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 2.0962855727664746, + "language_loss": 0.78749955, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81035042, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 5.566511869430542 + }, + { + "auxiliary_loss_clip": 0.01094766, + "auxiliary_loss_mlp": 0.01142401, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00097132, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 3.9484573953168423, + "language_loss": 0.64340127, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66577291, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 2.80849027633667 + }, + { + "auxiliary_loss_clip": 0.01095535, + "auxiliary_loss_mlp": 0.01142782, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00078034, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.9841528617213142, + "language_loss": 0.76317924, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.78556246, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 4.208577632904053 + }, + { + "auxiliary_loss_clip": 0.01091407, + "auxiliary_loss_mlp": 0.01142219, + "balance_loss_clip": 1.00149429, + "balance_loss_mlp": 1.00059879, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.81542549572399, + "language_loss": 0.71779597, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.74013221, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 2.728485107421875 + }, + { + "auxiliary_loss_clip": 0.01142574, + "auxiliary_loss_mlp": 0.01142448, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00082743, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 2.0663798917069633, + "language_loss": 0.66846955, + "learning_rate": 3.439118409456376e-06, + "loss": 0.69131976, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 2.5551438331604004 + }, + { + "auxiliary_loss_clip": 0.01159355, + "auxiliary_loss_mlp": 0.01142744, + "balance_loss_clip": 1.00218928, + "balance_loss_mlp": 1.00074184, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5565958231812502, + "language_loss": 0.76658779, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78960878, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.6094913482666016 + }, + { + "auxiliary_loss_clip": 0.01093283, + "auxiliary_loss_mlp": 0.0113045, + "balance_loss_clip": 1.00253797, + "balance_loss_mlp": 0.99998713, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9279096346748498, + "language_loss": 0.61200529, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63424259, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.1619668006896973 + }, + { + "auxiliary_loss_clip": 0.01127396, + "auxiliary_loss_mlp": 0.01142492, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00068092, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.490936690010383, + "language_loss": 0.76276374, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78546262, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 2.812764883041382 + }, + { + "auxiliary_loss_clip": 0.01157964, + "auxiliary_loss_mlp": 0.01142373, + "balance_loss_clip": 1.00203931, + "balance_loss_mlp": 1.00075197, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 1.7095371267617308, + "language_loss": 0.80505443, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82805777, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.5837483406066895 + }, + { + "auxiliary_loss_clip": 0.01141555, + "auxiliary_loss_mlp": 0.01143098, + "balance_loss_clip": 1.00195861, + "balance_loss_mlp": 1.00061965, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.21461480366794, + "language_loss": 0.88808197, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91092849, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.604099750518799 + }, + { + "auxiliary_loss_clip": 0.01158918, + "auxiliary_loss_mlp": 0.01142118, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00068831, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.7126622151368642, + "language_loss": 0.68272555, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70573592, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.5886647701263428 + }, + { + "auxiliary_loss_clip": 0.01157991, + "auxiliary_loss_mlp": 0.01142468, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00075197, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 2.598085092011031, + "language_loss": 0.83413666, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85714126, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.591956615447998 + }, + { + "auxiliary_loss_clip": 0.01126541, + "auxiliary_loss_mlp": 0.01142221, + "balance_loss_clip": 1.00176966, + "balance_loss_mlp": 1.00088668, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.5731572370297897, + "language_loss": 0.84158278, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86427039, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.6336143016815186 + }, + { + "auxiliary_loss_clip": 0.01141951, + "auxiliary_loss_mlp": 0.01143213, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00082898, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 19.20852909310257, + "language_loss": 0.83789122, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86074287, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.592970609664917 + }, + { + "auxiliary_loss_clip": 0.0112587, + "auxiliary_loss_mlp": 0.0114189, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00074625, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.7142571117809817, + "language_loss": 0.80765343, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83033109, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.601998805999756 + }, + { + "auxiliary_loss_clip": 0.01158233, + "auxiliary_loss_mlp": 0.01142484, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.00095832, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8521560217497195, + "language_loss": 0.86618435, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88919151, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.587383985519409 + }, + { + "auxiliary_loss_clip": 0.01147076, + "auxiliary_loss_mlp": 0.0114244, + "balance_loss_clip": 1.00223958, + "balance_loss_mlp": 1.00062823, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 2.0698563806831993, + "language_loss": 0.83293384, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85582894, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.5385990142822266 + }, + { + "auxiliary_loss_clip": 0.01157951, + "auxiliary_loss_mlp": 0.01142803, + "balance_loss_clip": 1.00200689, + "balance_loss_mlp": 1.0011822, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.6490944018987759, + "language_loss": 0.79528517, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81829274, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.5340676307678223 + }, + { + "auxiliary_loss_clip": 0.01141707, + "auxiliary_loss_mlp": 0.01143428, + "balance_loss_clip": 1.00195611, + "balance_loss_mlp": 1.00094962, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.5809630643880364, + "language_loss": 0.72759068, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75044203, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.6013753414154053 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01142104, + "balance_loss_clip": 1.00204539, + "balance_loss_mlp": 1.0007695, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.4627684178852576, + "language_loss": 0.73517734, + "learning_rate": 3.435055461383471e-06, + "loss": 0.75786221, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.6285786628723145 + }, + { + "auxiliary_loss_clip": 0.01158185, + "auxiliary_loss_mlp": 0.01143058, + "balance_loss_clip": 1.00212359, + "balance_loss_mlp": 1.00096107, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 4.739628037445998, + "language_loss": 0.71134365, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73435611, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.547075033187866 + }, + { + "auxiliary_loss_clip": 0.01127591, + "auxiliary_loss_mlp": 0.01143687, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00101709, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.0162510226797177, + "language_loss": 0.78867304, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81138587, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.6187684535980225 + }, + { + "auxiliary_loss_clip": 0.01123184, + "auxiliary_loss_mlp": 0.01130652, + "balance_loss_clip": 1.00242388, + "balance_loss_mlp": 1.00018978, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8919515253085261, + "language_loss": 0.58561563, + "learning_rate": 3.434241401387739e-06, + "loss": 0.608154, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.1872799396514893 + }, + { + "auxiliary_loss_clip": 0.01116656, + "auxiliary_loss_mlp": 0.01142242, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00090742, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 1.9090674365851452, + "language_loss": 0.85305429, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87564325, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.65112042427063 + }, + { + "auxiliary_loss_clip": 0.01159135, + "auxiliary_loss_mlp": 0.01142643, + "balance_loss_clip": 1.00206709, + "balance_loss_mlp": 1.00092697, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 1.7915745198381203, + "language_loss": 0.68313634, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70615411, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.578340530395508 + }, + { + "auxiliary_loss_clip": 0.01131611, + "auxiliary_loss_mlp": 0.011431, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00109816, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.8291677450720125, + "language_loss": 0.67128456, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69403166, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.6336967945098877 + }, + { + "auxiliary_loss_clip": 0.01124533, + "auxiliary_loss_mlp": 0.01142305, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00087523, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.6429938403417081, + "language_loss": 0.69278246, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71545088, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.659740447998047 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01142345, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00053406, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4381169280667487, + "language_loss": 0.77581942, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79868156, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.5784075260162354 + }, + { + "auxiliary_loss_clip": 0.0115824, + "auxiliary_loss_mlp": 0.01142817, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00091004, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.842891562395397, + "language_loss": 0.70937335, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73238397, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.5879621505737305 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01129842, + "balance_loss_clip": 1.00266075, + "balance_loss_mlp": 1.00014246, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6761923545253791, + "language_loss": 0.53067797, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55338663, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.301823377609253 + }, + { + "auxiliary_loss_clip": 0.01142542, + "auxiliary_loss_mlp": 0.01142254, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00082397, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 6.351405142920667, + "language_loss": 0.73914564, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76199359, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 3.9560160636901855 + }, + { + "auxiliary_loss_clip": 0.01142785, + "auxiliary_loss_mlp": 0.00748423, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00108123, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 4.777431440025802, + "language_loss": 0.80628949, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82520163, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.530306577682495 + }, + { + "auxiliary_loss_clip": 0.01173364, + "auxiliary_loss_mlp": 0.01129854, + "balance_loss_clip": 1.00283313, + "balance_loss_mlp": 1.00015473, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8418274709921741, + "language_loss": 0.59534609, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61837828, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.149775743484497 + }, + { + "auxiliary_loss_clip": 0.01174721, + "auxiliary_loss_mlp": 0.01142926, + "balance_loss_clip": 1.0020957, + "balance_loss_mlp": 1.00092387, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.593936229443824, + "language_loss": 0.81494319, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83811975, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.4966323375701904 + }, + { + "auxiliary_loss_clip": 0.01141123, + "auxiliary_loss_mlp": 0.01141562, + "balance_loss_clip": 1.00167406, + "balance_loss_mlp": 1.00070477, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6855875995176435, + "language_loss": 0.82610488, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84893179, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.606323003768921 + }, + { + "auxiliary_loss_clip": 0.01158119, + "auxiliary_loss_mlp": 0.01142158, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00082374, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 1.8894232302115674, + "language_loss": 0.69517636, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71817911, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.593517780303955 + }, + { + "auxiliary_loss_clip": 0.01174783, + "auxiliary_loss_mlp": 0.01142711, + "balance_loss_clip": 1.00221181, + "balance_loss_mlp": 1.00099528, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.5038074346812773, + "language_loss": 0.67852044, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70169538, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 2.568751573562622 + }, + { + "auxiliary_loss_clip": 0.01141403, + "auxiliary_loss_mlp": 0.01142104, + "balance_loss_clip": 1.00191033, + "balance_loss_mlp": 1.00086522, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.7959866484741431, + "language_loss": 0.83022237, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85305744, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 2.557159662246704 + }, + { + "auxiliary_loss_clip": 0.01163787, + "auxiliary_loss_mlp": 0.01142341, + "balance_loss_clip": 1.00228357, + "balance_loss_mlp": 1.00110209, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 1.8699144988829968, + "language_loss": 0.69742811, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72048938, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 5.318705320358276 + }, + { + "auxiliary_loss_clip": 0.01124683, + "auxiliary_loss_mlp": 0.00748509, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00098085, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 2.893357817648507, + "language_loss": 0.73652864, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75526053, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.6479411125183105 + }, + { + "auxiliary_loss_clip": 0.01141465, + "auxiliary_loss_mlp": 0.01142471, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00075531, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.8491839462041528, + "language_loss": 0.80829161, + "learning_rate": 3.429346772085922e-06, + "loss": 0.83113098, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 3.9712767601013184 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.01142668, + "balance_loss_clip": 1.00171363, + "balance_loss_mlp": 1.00085616, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 3.4821390587145715, + "language_loss": 0.65200353, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67451936, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 2.7854456901550293 + }, + { + "auxiliary_loss_clip": 0.01159227, + "auxiliary_loss_mlp": 0.0114229, + "balance_loss_clip": 1.00206447, + "balance_loss_mlp": 1.00086045, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 3.842958153998663, + "language_loss": 0.80705976, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83007497, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.573793411254883 + }, + { + "auxiliary_loss_clip": 0.01142769, + "auxiliary_loss_mlp": 0.00748513, + "balance_loss_clip": 1.00212884, + "balance_loss_mlp": 1.0010736, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.0221551462998106, + "language_loss": 0.80936205, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82827491, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 2.5720503330230713 + }, + { + "auxiliary_loss_clip": 0.01126054, + "auxiliary_loss_mlp": 0.01143128, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00093496, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.4426489930988666, + "language_loss": 0.77762407, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80031592, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 2.623080015182495 + }, + { + "auxiliary_loss_clip": 0.01159382, + "auxiliary_loss_mlp": 0.01142215, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00078487, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.6867403535600267, + "language_loss": 0.73842001, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76143599, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.6193997859954834 + }, + { + "auxiliary_loss_clip": 0.01142794, + "auxiliary_loss_mlp": 0.01142156, + "balance_loss_clip": 1.00213516, + "balance_loss_mlp": 1.00082183, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.897838290949624, + "language_loss": 0.72401416, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.7468636, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.593330144882202 + }, + { + "auxiliary_loss_clip": 0.01159413, + "auxiliary_loss_mlp": 0.01143305, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.0011121, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.035353572447624, + "language_loss": 0.86583072, + "learning_rate": 3.427438559239605e-06, + "loss": 0.8888579, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.526799201965332 + }, + { + "auxiliary_loss_clip": 0.01157719, + "auxiliary_loss_mlp": 0.0114251, + "balance_loss_clip": 1.00188863, + "balance_loss_mlp": 1.00088978, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 2.7470070915149676, + "language_loss": 0.66591996, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68892229, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.6315219402313232 + }, + { + "auxiliary_loss_clip": 0.01144127, + "auxiliary_loss_mlp": 0.01143198, + "balance_loss_clip": 1.00202632, + "balance_loss_mlp": 1.00119567, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 3.529831911177434, + "language_loss": 0.73101479, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75388801, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.531383991241455 + }, + { + "auxiliary_loss_clip": 0.01174698, + "auxiliary_loss_mlp": 0.01142933, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00093102, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.9165575843278062, + "language_loss": 0.84038699, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.8635633, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.4879391193389893 + }, + { + "auxiliary_loss_clip": 0.01127204, + "auxiliary_loss_mlp": 0.01143201, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.0008173, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.748848654727277, + "language_loss": 0.71680361, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73950773, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.6171658039093018 + }, + { + "auxiliary_loss_clip": 0.01076769, + "auxiliary_loss_mlp": 0.01142571, + "balance_loss_clip": 1.00190389, + "balance_loss_mlp": 1.00095022, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6910551851808402, + "language_loss": 0.84092796, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86312133, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.7580254077911377 + }, + { + "auxiliary_loss_clip": 0.01141765, + "auxiliary_loss_mlp": 0.0114329, + "balance_loss_clip": 1.00203359, + "balance_loss_mlp": 1.00109696, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.865748680713164, + "language_loss": 0.89759982, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92045033, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.536250352859497 + }, + { + "auxiliary_loss_clip": 0.01093829, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.00160074, + "balance_loss_mlp": 1.0007267, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 2.794931127939977, + "language_loss": 0.73564804, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75800598, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.8312950134277344 + }, + { + "auxiliary_loss_clip": 0.01174827, + "auxiliary_loss_mlp": 0.01142808, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00090122, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.570598557369082, + "language_loss": 0.74649525, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76967156, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.46140193939209 + }, + { + "auxiliary_loss_clip": 0.01142604, + "auxiliary_loss_mlp": 0.01142099, + "balance_loss_clip": 1.00193751, + "balance_loss_mlp": 1.00086021, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.7800296647187728, + "language_loss": 0.89100784, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91385484, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.5888161659240723 + }, + { + "auxiliary_loss_clip": 0.01159032, + "auxiliary_loss_mlp": 0.0114236, + "balance_loss_clip": 1.0021745, + "balance_loss_mlp": 1.0010252, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.4808418185693917, + "language_loss": 0.711128, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73414195, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.5610084533691406 + }, + { + "auxiliary_loss_clip": 0.0114113, + "auxiliary_loss_mlp": 0.01142509, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00088811, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 1.9030360178741446, + "language_loss": 0.86361277, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88644922, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.6093497276306152 + }, + { + "auxiliary_loss_clip": 0.01174562, + "auxiliary_loss_mlp": 0.01142433, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00109816, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.6772545764704851, + "language_loss": 0.76362896, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78679895, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.484107494354248 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01129868, + "balance_loss_clip": 1.00304163, + "balance_loss_mlp": 1.0001688, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7208898800591413, + "language_loss": 0.50212508, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52515817, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.147919178009033 + }, + { + "auxiliary_loss_clip": 0.01113526, + "auxiliary_loss_mlp": 0.01142301, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00087106, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.7781115410415311, + "language_loss": 0.72413099, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.7466892, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.630256175994873 + }, + { + "auxiliary_loss_clip": 0.01126913, + "auxiliary_loss_mlp": 0.01129939, + "balance_loss_clip": 1.00289297, + "balance_loss_mlp": 1.00023961, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7515202452710354, + "language_loss": 0.59205008, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61461866, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.1944162845611572 + }, + { + "auxiliary_loss_clip": 0.01142225, + "auxiliary_loss_mlp": 0.01142265, + "balance_loss_clip": 1.00193536, + "balance_loss_mlp": 1.00074005, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.102219115719953, + "language_loss": 0.73719919, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.7600441, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.6049139499664307 + }, + { + "auxiliary_loss_clip": 0.01143672, + "auxiliary_loss_mlp": 0.01142271, + "balance_loss_clip": 1.00204062, + "balance_loss_mlp": 1.00093627, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 2.916916575285031, + "language_loss": 0.81204045, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.83489984, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.536290407180786 + }, + { + "auxiliary_loss_clip": 0.01125844, + "auxiliary_loss_mlp": 0.01143262, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00106931, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.634676458952525, + "language_loss": 0.72460961, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74730062, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.6454360485076904 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01142558, + "balance_loss_clip": 1.00188673, + "balance_loss_mlp": 1.00074625, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.6355497301553152, + "language_loss": 0.68412954, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70696759, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.759700059890747 + }, + { + "auxiliary_loss_clip": 0.01110741, + "auxiliary_loss_mlp": 0.01141962, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00081837, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.53118887613315, + "language_loss": 0.67695749, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.69948447, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 4.093077659606934 + }, + { + "auxiliary_loss_clip": 0.01158781, + "auxiliary_loss_mlp": 0.01142384, + "balance_loss_clip": 1.00212359, + "balance_loss_mlp": 1.00114489, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 2.0050681583673646, + "language_loss": 0.75523174, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77824342, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.6344172954559326 + }, + { + "auxiliary_loss_clip": 0.0117452, + "auxiliary_loss_mlp": 0.01142397, + "balance_loss_clip": 1.00201786, + "balance_loss_mlp": 1.00096667, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 3.1555887481862093, + "language_loss": 0.73745888, + "learning_rate": 3.42142406835758e-06, + "loss": 0.7606281, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.4839329719543457 + }, + { + "auxiliary_loss_clip": 0.01142223, + "auxiliary_loss_mlp": 0.01142019, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00078011, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.140029845558904, + "language_loss": 0.80569685, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82853925, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.6300227642059326 + }, + { + "auxiliary_loss_clip": 0.01141404, + "auxiliary_loss_mlp": 0.01129073, + "balance_loss_clip": 1.00305915, + "balance_loss_mlp": 1.00013626, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7305987698675381, + "language_loss": 0.50834137, + "learning_rate": 3.420876001185698e-06, + "loss": 0.53104615, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.078019142150879 + }, + { + "auxiliary_loss_clip": 0.01099796, + "auxiliary_loss_mlp": 0.01141627, + "balance_loss_clip": 1.00200188, + "balance_loss_mlp": 1.00086474, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 2.2538115672965984, + "language_loss": 0.74836588, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.77078015, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.7295262813568115 + }, + { + "auxiliary_loss_clip": 0.01157757, + "auxiliary_loss_mlp": 0.01141109, + "balance_loss_clip": 1.00192606, + "balance_loss_mlp": 1.00072861, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.7100952790439086, + "language_loss": 0.7147485, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73773718, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.525657892227173 + }, + { + "auxiliary_loss_clip": 0.01157859, + "auxiliary_loss_mlp": 0.0114189, + "balance_loss_clip": 1.00215077, + "balance_loss_mlp": 1.0009371, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 5.790221975721591, + "language_loss": 0.7072655, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.73026294, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 2.499831438064575 + }, + { + "auxiliary_loss_clip": 0.01147079, + "auxiliary_loss_mlp": 0.01142276, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00094199, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.2860493020252814, + "language_loss": 0.81140614, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83429968, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 3.9861865043640137 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.0114161, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00084794, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.5068174823950338, + "language_loss": 0.80747342, + "learning_rate": 3.419504890542124e-06, + "loss": 0.83063447, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 4.123296022415161 + }, + { + "auxiliary_loss_clip": 0.01142498, + "auxiliary_loss_mlp": 0.01141366, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.00069904, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.7687263102049595, + "language_loss": 0.88035619, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90319479, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.5415239334106445 + }, + { + "auxiliary_loss_clip": 0.01142162, + "auxiliary_loss_mlp": 0.01142121, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00097716, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.8427907852904755, + "language_loss": 0.92020065, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94304341, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 3.961228132247925 + }, + { + "auxiliary_loss_clip": 0.01111108, + "auxiliary_loss_mlp": 0.0114321, + "balance_loss_clip": 1.00206947, + "balance_loss_mlp": 1.00130296, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.0664779851848154, + "language_loss": 0.73959798, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76214117, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 2.618785858154297 + }, + { + "auxiliary_loss_clip": 0.0115768, + "auxiliary_loss_mlp": 0.01141776, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00091851, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 1.9394313820059157, + "language_loss": 0.76242816, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78542268, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 2.5239312648773193 + }, + { + "auxiliary_loss_clip": 0.01128561, + "auxiliary_loss_mlp": 0.01141535, + "balance_loss_clip": 1.00200713, + "balance_loss_mlp": 1.00077307, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2291929795578134, + "language_loss": 0.76962984, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.7923308, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.6532154083251953 + }, + { + "auxiliary_loss_clip": 0.0114276, + "auxiliary_loss_mlp": 0.01142273, + "balance_loss_clip": 1.00201154, + "balance_loss_mlp": 1.00084376, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.7847505149590719, + "language_loss": 0.68260413, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70545447, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.7329533100128174 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01141055, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00076985, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.598385994717067, + "language_loss": 0.75489277, + "learning_rate": 3.417583075166451e-06, + "loss": 0.77788019, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 2.5590662956237793 + }, + { + "auxiliary_loss_clip": 0.01159356, + "auxiliary_loss_mlp": 0.0114233, + "balance_loss_clip": 1.00212109, + "balance_loss_mlp": 1.00099552, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.650487383670185, + "language_loss": 0.76166081, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78467762, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.606109142303467 + }, + { + "auxiliary_loss_clip": 0.01143917, + "auxiliary_loss_mlp": 0.01142474, + "balance_loss_clip": 1.00201452, + "balance_loss_mlp": 1.00104403, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.2303188999362673, + "language_loss": 0.74935549, + "learning_rate": 3.417033501108875e-06, + "loss": 0.77221936, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.5495123863220215 + }, + { + "auxiliary_loss_clip": 0.01174601, + "auxiliary_loss_mlp": 0.01141534, + "balance_loss_clip": 1.0021646, + "balance_loss_mlp": 1.00077152, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 3.010539975717856, + "language_loss": 0.72549635, + "learning_rate": 3.416758633473798e-06, + "loss": 0.7486577, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.5058648586273193 + }, + { + "auxiliary_loss_clip": 0.01142462, + "auxiliary_loss_mlp": 0.01141618, + "balance_loss_clip": 1.00203264, + "balance_loss_mlp": 1.00076056, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 2.074203301168897, + "language_loss": 0.74206066, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.7649014, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.5745739936828613 + }, + { + "auxiliary_loss_clip": 0.01174578, + "auxiliary_loss_mlp": 0.01141547, + "balance_loss_clip": 1.00211465, + "balance_loss_mlp": 1.00107121, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.5268642719523653, + "language_loss": 0.76571149, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78887272, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.531261444091797 + }, + { + "auxiliary_loss_clip": 0.01159358, + "auxiliary_loss_mlp": 0.0114104, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00113606, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.859015258198295, + "language_loss": 0.82139522, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84439921, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.544990301132202 + }, + { + "auxiliary_loss_clip": 0.0117468, + "auxiliary_loss_mlp": 0.01141994, + "balance_loss_clip": 1.0020746, + "balance_loss_mlp": 1.00075507, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 4.123976681472121, + "language_loss": 0.76941675, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79258347, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.5122482776641846 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.00748309, + "balance_loss_clip": 1.00207031, + "balance_loss_mlp": 1.00073385, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.283856336423394, + "language_loss": 0.81682587, + "learning_rate": 3.415383489652503e-06, + "loss": 0.83557069, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.5822501182556152 + }, + { + "auxiliary_loss_clip": 0.01144162, + "auxiliary_loss_mlp": 0.01141139, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00094914, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 2.193777913394334, + "language_loss": 0.77471578, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.7975688, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.619137763977051 + }, + { + "auxiliary_loss_clip": 0.01140889, + "auxiliary_loss_mlp": 0.01141958, + "balance_loss_clip": 1.00169337, + "balance_loss_mlp": 1.00119591, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 1.9476187733599548, + "language_loss": 0.8209542, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.8437826, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.5591492652893066 + }, + { + "auxiliary_loss_clip": 0.01157846, + "auxiliary_loss_mlp": 0.01141533, + "balance_loss_clip": 1.00194037, + "balance_loss_mlp": 1.00105643, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.8531349489853493, + "language_loss": 0.91544259, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93843639, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.4987854957580566 + }, + { + "auxiliary_loss_clip": 0.01157362, + "auxiliary_loss_mlp": 0.011423, + "balance_loss_clip": 1.00189507, + "balance_loss_mlp": 1.00106072, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 1.9581973667530161, + "language_loss": 0.76220453, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78520119, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.5429375171661377 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.0114107, + "balance_loss_clip": 1.00172317, + "balance_loss_mlp": 1.00078487, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.779588152646532, + "language_loss": 0.89145851, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91412646, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.5598504543304443 + }, + { + "auxiliary_loss_clip": 0.01142232, + "auxiliary_loss_mlp": 0.011411, + "balance_loss_clip": 1.00189841, + "balance_loss_mlp": 1.00071919, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 1.7158922592760035, + "language_loss": 0.70896709, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73180044, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.6115496158599854 + }, + { + "auxiliary_loss_clip": 0.01140839, + "auxiliary_loss_mlp": 0.01142181, + "balance_loss_clip": 1.00165462, + "balance_loss_mlp": 1.00075066, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.6883551759300994, + "language_loss": 0.91421562, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93704581, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.6125810146331787 + }, + { + "auxiliary_loss_clip": 0.01142448, + "auxiliary_loss_mlp": 0.01141831, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00097287, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 2.1814451798129744, + "language_loss": 0.73345828, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75630105, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.6437416076660156 + }, + { + "auxiliary_loss_clip": 0.01159192, + "auxiliary_loss_mlp": 0.01141518, + "balance_loss_clip": 1.00204885, + "balance_loss_mlp": 1.00075531, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.7575336374456099, + "language_loss": 0.71750224, + "learning_rate": 3.41290485034781e-06, + "loss": 0.74050939, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.661911725997925 + }, + { + "auxiliary_loss_clip": 0.01142563, + "auxiliary_loss_mlp": 0.01141742, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00088453, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.039482142793919, + "language_loss": 0.78137589, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80421895, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.530791759490967 + }, + { + "auxiliary_loss_clip": 0.01157526, + "auxiliary_loss_mlp": 0.01141605, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.00103295, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4480571418429755, + "language_loss": 0.90274715, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92573845, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.5463356971740723 + }, + { + "auxiliary_loss_clip": 0.01142409, + "auxiliary_loss_mlp": 0.01141677, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.0010097, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.3328493008655045, + "language_loss": 0.88134784, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90418869, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.5425047874450684 + }, + { + "auxiliary_loss_clip": 0.01158658, + "auxiliary_loss_mlp": 0.00748272, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00072575, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 3.0955844806784656, + "language_loss": 0.81941199, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.83848131, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.529733419418335 + }, + { + "auxiliary_loss_clip": 0.01141838, + "auxiliary_loss_mlp": 0.01141238, + "balance_loss_clip": 1.00184381, + "balance_loss_mlp": 1.00085747, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.7079106753486517, + "language_loss": 0.79456675, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81739753, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.573686361312866 + }, + { + "auxiliary_loss_clip": 0.01142469, + "auxiliary_loss_mlp": 0.01141415, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00103402, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.3230713641533542, + "language_loss": 0.89949149, + "learning_rate": 3.411250012687582e-06, + "loss": 0.92233032, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 3.957710027694702 + }, + { + "auxiliary_loss_clip": 0.01142712, + "auxiliary_loss_mlp": 0.00748193, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00074303, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.350813162690678, + "language_loss": 0.63221025, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65111935, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.552239418029785 + }, + { + "auxiliary_loss_clip": 0.01141091, + "auxiliary_loss_mlp": 0.01141546, + "balance_loss_clip": 1.0017246, + "balance_loss_mlp": 1.00116563, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.603114042856047, + "language_loss": 0.69840997, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72123635, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.7015304565429688 + }, + { + "auxiliary_loss_clip": 0.01148179, + "auxiliary_loss_mlp": 0.01129039, + "balance_loss_clip": 1.00424302, + "balance_loss_mlp": 1.00010228, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7181819624294403, + "language_loss": 0.61555362, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63832569, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.2166662216186523 + }, + { + "auxiliary_loss_clip": 0.01067335, + "auxiliary_loss_mlp": 0.0114223, + "balance_loss_clip": 1.00215137, + "balance_loss_mlp": 1.00118172, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 2.622643888325744, + "language_loss": 0.64832103, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67041671, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 2.8600597381591797 + }, + { + "auxiliary_loss_clip": 0.01141955, + "auxiliary_loss_mlp": 0.00748141, + "balance_loss_clip": 1.00199866, + "balance_loss_mlp": 1.00078714, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 1.9525369185276318, + "language_loss": 0.78175509, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.80065608, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.6672983169555664 + }, + { + "auxiliary_loss_clip": 0.01141176, + "auxiliary_loss_mlp": 0.01141484, + "balance_loss_clip": 1.00216591, + "balance_loss_mlp": 1.00110364, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 2.02544757521512, + "language_loss": 0.82690465, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84973127, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 4.059281826019287 + }, + { + "auxiliary_loss_clip": 0.01157738, + "auxiliary_loss_mlp": 0.01141577, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00071931, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.3663164569459343, + "language_loss": 0.70751107, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73050427, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 4.217862606048584 + }, + { + "auxiliary_loss_clip": 0.01124505, + "auxiliary_loss_mlp": 0.01141101, + "balance_loss_clip": 1.00171638, + "balance_loss_mlp": 1.00081527, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.7959492602748974, + "language_loss": 0.79054761, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81320369, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 2.665832042694092 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01141055, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00086546, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.6214634711180294, + "language_loss": 0.70459771, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72726262, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 4.02248740196228 + }, + { + "auxiliary_loss_clip": 0.011413, + "auxiliary_loss_mlp": 0.01141429, + "balance_loss_clip": 1.00207818, + "balance_loss_mlp": 1.000857, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 1.9359658846079357, + "language_loss": 0.71905434, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74188161, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.569964647293091 + }, + { + "auxiliary_loss_clip": 0.01157496, + "auxiliary_loss_mlp": 0.0114066, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.0007565, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5355459018196764, + "language_loss": 0.5885334, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61151493, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 2.5629758834838867 + }, + { + "auxiliary_loss_clip": 0.01141063, + "auxiliary_loss_mlp": 0.0114147, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.0007081, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.9028601250967165, + "language_loss": 0.73895526, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76178062, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.550931692123413 + }, + { + "auxiliary_loss_clip": 0.01157804, + "auxiliary_loss_mlp": 0.01141614, + "balance_loss_clip": 1.00212443, + "balance_loss_mlp": 1.00075674, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 2.3129911150022138, + "language_loss": 0.7785145, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80150867, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 2.543184995651245 + }, + { + "auxiliary_loss_clip": 0.01159218, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.00092006, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.7819179985286755, + "language_loss": 0.82023978, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84325641, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 2.5537917613983154 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01141365, + "balance_loss_clip": 1.00209022, + "balance_loss_mlp": 1.00079334, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 2.068546825140472, + "language_loss": 0.72689337, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74943984, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.7083263397216797 + }, + { + "auxiliary_loss_clip": 0.01140792, + "auxiliary_loss_mlp": 0.0114062, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00090659, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.3333451722040173, + "language_loss": 0.68135208, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70416623, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.5740654468536377 + }, + { + "auxiliary_loss_clip": 0.01143073, + "auxiliary_loss_mlp": 0.01141169, + "balance_loss_clip": 1.0020684, + "balance_loss_mlp": 1.00107455, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.7244317938556095, + "language_loss": 0.71763438, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74047685, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.5915441513061523 + }, + { + "auxiliary_loss_clip": 0.01140795, + "auxiliary_loss_mlp": 0.01141146, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00095594, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.6981232373364723, + "language_loss": 0.81139326, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83421266, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.642057180404663 + }, + { + "auxiliary_loss_clip": 0.01174455, + "auxiliary_loss_mlp": 0.01141728, + "balance_loss_clip": 1.00218153, + "balance_loss_mlp": 1.00096619, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.8805466709152252, + "language_loss": 0.75209522, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77525711, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.519035816192627 + }, + { + "auxiliary_loss_clip": 0.01174464, + "auxiliary_loss_mlp": 0.01141308, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.000736, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.8550795951979004, + "language_loss": 0.74498922, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76814693, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.500732660293579 + }, + { + "auxiliary_loss_clip": 0.01126795, + "auxiliary_loss_mlp": 0.01142409, + "balance_loss_clip": 1.00203228, + "balance_loss_mlp": 1.00097895, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.6584674504317742, + "language_loss": 0.62592232, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64861441, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.647894859313965 + }, + { + "auxiliary_loss_clip": 0.01142269, + "auxiliary_loss_mlp": 0.01141622, + "balance_loss_clip": 1.00195539, + "balance_loss_mlp": 1.00085926, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 2.1634000503307123, + "language_loss": 0.78521812, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80805707, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.7588579654693604 + }, + { + "auxiliary_loss_clip": 0.01110506, + "auxiliary_loss_mlp": 0.01141611, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00113463, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 1.931301666974219, + "language_loss": 0.68514371, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70766491, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.6611852645874023 + }, + { + "auxiliary_loss_clip": 0.01158538, + "auxiliary_loss_mlp": 0.01141175, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00108004, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.6555213532893398, + "language_loss": 0.61044317, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63344032, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.6072916984558105 + }, + { + "auxiliary_loss_clip": 0.01159062, + "auxiliary_loss_mlp": 0.01141555, + "balance_loss_clip": 1.00216365, + "balance_loss_mlp": 1.00069773, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.8080615284388069, + "language_loss": 0.82924032, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.85224652, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.5754709243774414 + }, + { + "auxiliary_loss_clip": 0.01157978, + "auxiliary_loss_mlp": 0.01141995, + "balance_loss_clip": 1.00233269, + "balance_loss_mlp": 1.00066018, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.724046475520157, + "language_loss": 0.68336403, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7063638, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.5489041805267334 + }, + { + "auxiliary_loss_clip": 0.01143633, + "auxiliary_loss_mlp": 0.01141183, + "balance_loss_clip": 1.00207162, + "balance_loss_mlp": 1.00089777, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.1199116023714706, + "language_loss": 0.71220076, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73504889, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.536754608154297 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01128397, + "balance_loss_clip": 1.00369596, + "balance_loss_mlp": 1.0002228, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7320644432624334, + "language_loss": 0.55831343, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.58084702, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.322897434234619 + }, + { + "auxiliary_loss_clip": 0.0111034, + "auxiliary_loss_mlp": 0.01142, + "balance_loss_clip": 1.00190461, + "balance_loss_mlp": 1.00104702, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.602467539131952, + "language_loss": 0.77404386, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79656726, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.649677276611328 + }, + { + "auxiliary_loss_clip": 0.01174317, + "auxiliary_loss_mlp": 0.01140372, + "balance_loss_clip": 1.00217474, + "balance_loss_mlp": 1.00075459, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.5176578882338008, + "language_loss": 0.81335205, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83649898, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.5142674446105957 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01140825, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00072992, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.6920638595696411, + "language_loss": 0.79404294, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81702954, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.5227210521698 + }, + { + "auxiliary_loss_clip": 0.01108883, + "auxiliary_loss_mlp": 0.01141199, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.001104, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 1.876571163188626, + "language_loss": 0.73995894, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76245975, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.6777944564819336 + }, + { + "auxiliary_loss_clip": 0.01141786, + "auxiliary_loss_mlp": 0.01140602, + "balance_loss_clip": 1.00199306, + "balance_loss_mlp": 1.00079381, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.593705304745472, + "language_loss": 0.71780014, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74062407, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.725593090057373 + }, + { + "auxiliary_loss_clip": 0.01125539, + "auxiliary_loss_mlp": 0.00748185, + "balance_loss_clip": 1.00180233, + "balance_loss_mlp": 1.00084782, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.502186916700616, + "language_loss": 0.73253578, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75127304, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.6488327980041504 + }, + { + "auxiliary_loss_clip": 0.01140943, + "auxiliary_loss_mlp": 0.01140614, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00080526, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 1.9863656678402384, + "language_loss": 0.75595117, + "learning_rate": 3.401558468884188e-06, + "loss": 0.77876669, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.6042141914367676 + }, + { + "auxiliary_loss_clip": 0.01141224, + "auxiliary_loss_mlp": 0.0114192, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00087166, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.4390630362828751, + "language_loss": 0.65964407, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68247557, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.6420369148254395 + }, + { + "auxiliary_loss_clip": 0.01126099, + "auxiliary_loss_mlp": 0.0114134, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00124526, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 2.0207882532447887, + "language_loss": 0.79778218, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82045656, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.6829333305358887 + }, + { + "auxiliary_loss_clip": 0.01157571, + "auxiliary_loss_mlp": 0.01140886, + "balance_loss_clip": 1.00212312, + "balance_loss_mlp": 1.00088692, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4424552454886033, + "language_loss": 0.67819035, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.70117491, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 4.089800596237183 + }, + { + "auxiliary_loss_clip": 0.01147098, + "auxiliary_loss_mlp": 0.01141251, + "balance_loss_clip": 1.00259233, + "balance_loss_mlp": 1.00096595, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.667534875836624, + "language_loss": 0.78361279, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80649626, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.5928335189819336 + }, + { + "auxiliary_loss_clip": 0.0111142, + "auxiliary_loss_mlp": 0.01140999, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00090408, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.605775942519105, + "language_loss": 0.84495497, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86747921, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.6688361167907715 + }, + { + "auxiliary_loss_clip": 0.01157689, + "auxiliary_loss_mlp": 0.01141386, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00100517, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.6017763038315473, + "language_loss": 0.67074245, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69373322, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.5613372325897217 + }, + { + "auxiliary_loss_clip": 0.0111171, + "auxiliary_loss_mlp": 0.01140381, + "balance_loss_clip": 1.00183368, + "balance_loss_mlp": 1.00085878, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.6394157522610018, + "language_loss": 0.77229047, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79481137, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.6602325439453125 + }, + { + "auxiliary_loss_clip": 0.01157846, + "auxiliary_loss_mlp": 0.00748162, + "balance_loss_clip": 1.00215733, + "balance_loss_mlp": 1.00079632, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.6694572912235859, + "language_loss": 0.72212756, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74118757, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 4.0714380741119385 + }, + { + "auxiliary_loss_clip": 0.01140925, + "auxiliary_loss_mlp": 0.01140801, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00080204, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 2.8847397138974293, + "language_loss": 0.80488223, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82769954, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 3.9758012294769287 + }, + { + "auxiliary_loss_clip": 0.01158984, + "auxiliary_loss_mlp": 0.01140644, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00083542, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 1.7201651091508314, + "language_loss": 0.83255857, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85555482, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.5348451137542725 + }, + { + "auxiliary_loss_clip": 0.01126857, + "auxiliary_loss_mlp": 0.01140226, + "balance_loss_clip": 1.00193512, + "balance_loss_mlp": 1.00079846, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3240822303889537, + "language_loss": 0.75262064, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77529144, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 2.661684036254883 + }, + { + "auxiliary_loss_clip": 0.01163868, + "auxiliary_loss_mlp": 0.01140685, + "balance_loss_clip": 1.00289941, + "balance_loss_mlp": 1.00106668, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.590258406481725, + "language_loss": 0.88567936, + "learning_rate": 3.398220643612143e-06, + "loss": 0.9087249, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 4.052324295043945 + }, + { + "auxiliary_loss_clip": 0.01157904, + "auxiliary_loss_mlp": 0.01141113, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00092244, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.5356685925886644, + "language_loss": 0.71244252, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73543262, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 2.6730494499206543 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01140956, + "balance_loss_clip": 1.00174284, + "balance_loss_mlp": 1.00105226, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 2.7920895998945294, + "language_loss": 0.80023992, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82289201, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 2.6326382160186768 + }, + { + "auxiliary_loss_clip": 0.01158678, + "auxiliary_loss_mlp": 0.0074735, + "balance_loss_clip": 1.00386071, + "balance_loss_mlp": 1.0006398, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7258899220595392, + "language_loss": 0.61655194, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63561219, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.127629280090332 + }, + { + "auxiliary_loss_clip": 0.01157928, + "auxiliary_loss_mlp": 0.01141213, + "balance_loss_clip": 1.00203502, + "balance_loss_mlp": 1.0008316, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 2.547683406245688, + "language_loss": 0.77342409, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79641557, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.6070339679718018 + }, + { + "auxiliary_loss_clip": 0.01157509, + "auxiliary_loss_mlp": 0.01140598, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00088477, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.4221539043974183, + "language_loss": 0.91297108, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93595219, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 2.525942802429199 + }, + { + "auxiliary_loss_clip": 0.01157916, + "auxiliary_loss_mlp": 0.01142066, + "balance_loss_clip": 1.0023042, + "balance_loss_mlp": 1.0011133, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.9635782794124776, + "language_loss": 0.69288933, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71588916, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 2.5434343814849854 + }, + { + "auxiliary_loss_clip": 0.01142491, + "auxiliary_loss_mlp": 0.01141735, + "balance_loss_clip": 1.00206339, + "balance_loss_mlp": 1.00078177, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.5481891102117897, + "language_loss": 0.63909483, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.661937, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 2.6637027263641357 + }, + { + "auxiliary_loss_clip": 0.01174298, + "auxiliary_loss_mlp": 0.01140868, + "balance_loss_clip": 1.00220072, + "balance_loss_mlp": 1.00105965, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 1.955203153784305, + "language_loss": 0.86199653, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88514829, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.4781107902526855 + }, + { + "auxiliary_loss_clip": 0.01174524, + "auxiliary_loss_mlp": 0.01141225, + "balance_loss_clip": 1.00221968, + "balance_loss_mlp": 1.00084424, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 3.1652095004578973, + "language_loss": 0.79977328, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82293075, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.477621078491211 + }, + { + "auxiliary_loss_clip": 0.01143541, + "auxiliary_loss_mlp": 0.01141429, + "balance_loss_clip": 1.00207996, + "balance_loss_mlp": 1.00104785, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.7012122891810826, + "language_loss": 0.78611821, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80896789, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.595402240753174 + }, + { + "auxiliary_loss_clip": 0.01124587, + "auxiliary_loss_mlp": 0.01141308, + "balance_loss_clip": 1.00183976, + "balance_loss_mlp": 1.00102234, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 2.1891582832325427, + "language_loss": 0.73453224, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75719118, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.582261323928833 + }, + { + "auxiliary_loss_clip": 0.01157661, + "auxiliary_loss_mlp": 0.01141064, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.00096893, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.60501939046799, + "language_loss": 0.79622835, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81921554, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.5679855346679688 + }, + { + "auxiliary_loss_clip": 0.01141978, + "auxiliary_loss_mlp": 0.01141617, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00114059, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.2664258310141445, + "language_loss": 0.76657641, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.78941238, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.542593479156494 + }, + { + "auxiliary_loss_clip": 0.01141055, + "auxiliary_loss_mlp": 0.01140755, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00104189, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.7775240291389818, + "language_loss": 0.81880021, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.8416183, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.5560879707336426 + }, + { + "auxiliary_loss_clip": 0.01108614, + "auxiliary_loss_mlp": 0.01140511, + "balance_loss_clip": 1.00175035, + "balance_loss_mlp": 1.00079799, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.8644170409942569, + "language_loss": 0.70294976, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72544098, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.663753032684326 + }, + { + "auxiliary_loss_clip": 0.01156994, + "auxiliary_loss_mlp": 0.01128234, + "balance_loss_clip": 1.00364149, + "balance_loss_mlp": 1.00006032, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7001124684199999, + "language_loss": 0.57172471, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59457695, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.2314934730529785 + }, + { + "auxiliary_loss_clip": 0.01141182, + "auxiliary_loss_mlp": 0.01141721, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00114965, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.4751919953279677, + "language_loss": 0.69148749, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71431655, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.6090385913848877 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.01140882, + "balance_loss_clip": 1.00188243, + "balance_loss_mlp": 1.00088215, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.494864815003493, + "language_loss": 0.69744945, + "learning_rate": 3.393199595837555e-06, + "loss": 0.72043663, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.6090571880340576 + }, + { + "auxiliary_loss_clip": 0.01093646, + "auxiliary_loss_mlp": 0.01140858, + "balance_loss_clip": 1.00171423, + "balance_loss_mlp": 1.00076318, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.6715924600717396, + "language_loss": 0.7284472, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75079226, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.7591593265533447 + }, + { + "auxiliary_loss_clip": 0.0112846, + "auxiliary_loss_mlp": 0.01141308, + "balance_loss_clip": 1.00200772, + "balance_loss_mlp": 1.00102246, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.215135773642819, + "language_loss": 0.83545196, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.85814965, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.7602837085723877 + }, + { + "auxiliary_loss_clip": 0.01094794, + "auxiliary_loss_mlp": 0.00748311, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00089622, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.0798673090443303, + "language_loss": 0.69173121, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71016228, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.7323670387268066 + }, + { + "auxiliary_loss_clip": 0.01174327, + "auxiliary_loss_mlp": 0.01140244, + "balance_loss_clip": 1.00218964, + "balance_loss_mlp": 1.0010078, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.4564521456760144, + "language_loss": 0.73735249, + "learning_rate": 3.392081480737698e-06, + "loss": 0.76049817, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.4975485801696777 + }, + { + "auxiliary_loss_clip": 0.01157608, + "auxiliary_loss_mlp": 0.00748049, + "balance_loss_clip": 1.00203323, + "balance_loss_mlp": 1.0007211, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.482474897498093, + "language_loss": 0.66561908, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68467569, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.5308735370635986 + }, + { + "auxiliary_loss_clip": 0.01110742, + "auxiliary_loss_mlp": 0.01141366, + "balance_loss_clip": 1.00188148, + "balance_loss_mlp": 1.00098479, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.834933069614218, + "language_loss": 0.79468298, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81720412, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.7242631912231445 + }, + { + "auxiliary_loss_clip": 0.01159048, + "auxiliary_loss_mlp": 0.01141725, + "balance_loss_clip": 1.00214195, + "balance_loss_mlp": 1.0009625, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.5238295260120065, + "language_loss": 0.80560207, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82860982, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.531708240509033 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01141429, + "balance_loss_clip": 1.00190389, + "balance_loss_mlp": 1.00085711, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.2714956003335325, + "language_loss": 0.63362706, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.65630567, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 2.621612787246704 + }, + { + "auxiliary_loss_clip": 0.01157995, + "auxiliary_loss_mlp": 0.01141124, + "balance_loss_clip": 1.00197303, + "balance_loss_mlp": 1.00093341, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 2.138256384278534, + "language_loss": 0.82590634, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84889758, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.5242555141448975 + }, + { + "auxiliary_loss_clip": 0.01174371, + "auxiliary_loss_mlp": 0.01141076, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.00107706, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.322341870353143, + "language_loss": 0.76825702, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79141152, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 4.010560989379883 + }, + { + "auxiliary_loss_clip": 0.01174489, + "auxiliary_loss_mlp": 0.0114121, + "balance_loss_clip": 1.00225759, + "balance_loss_mlp": 1.00102007, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 2.0717554670876623, + "language_loss": 0.84623873, + "learning_rate": 3.390122747388459e-06, + "loss": 0.86939567, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.5395123958587646 + }, + { + "auxiliary_loss_clip": 0.0114175, + "auxiliary_loss_mlp": 0.01140501, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.0007875, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.3864485989474038, + "language_loss": 0.76956272, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79238522, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.589320421218872 + }, + { + "auxiliary_loss_clip": 0.01127303, + "auxiliary_loss_mlp": 0.01140729, + "balance_loss_clip": 1.00185287, + "balance_loss_mlp": 1.0007298, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 1.8425775389084904, + "language_loss": 0.78460205, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80728239, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.6199657917022705 + }, + { + "auxiliary_loss_clip": 0.01126023, + "auxiliary_loss_mlp": 0.01141371, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00108552, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.0937189531223477, + "language_loss": 0.87309444, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89576828, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.623647451400757 + }, + { + "auxiliary_loss_clip": 0.01108193, + "auxiliary_loss_mlp": 0.01141594, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00092721, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 3.1408846258605854, + "language_loss": 0.81540525, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83790314, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.6306231021881104 + }, + { + "auxiliary_loss_clip": 0.01125971, + "auxiliary_loss_mlp": 0.01141351, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00087428, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 2.0107717295184586, + "language_loss": 0.81383741, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.8365106, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 4.008370399475098 + }, + { + "auxiliary_loss_clip": 0.01142402, + "auxiliary_loss_mlp": 0.00748189, + "balance_loss_clip": 1.00194609, + "balance_loss_mlp": 1.00086236, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.3000807209902288, + "language_loss": 0.76626915, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78517503, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.5319876670837402 + }, + { + "auxiliary_loss_clip": 0.01143791, + "auxiliary_loss_mlp": 0.01140476, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.0009532, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 1.9965632722238493, + "language_loss": 0.70316106, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72600377, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 3.9728434085845947 + }, + { + "auxiliary_loss_clip": 0.01110566, + "auxiliary_loss_mlp": 0.01141575, + "balance_loss_clip": 1.00199509, + "balance_loss_mlp": 1.00090826, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.5082624878737834, + "language_loss": 0.92320687, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94572818, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.629751682281494 + }, + { + "auxiliary_loss_clip": 0.01174418, + "auxiliary_loss_mlp": 0.01141488, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00082111, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 1.9164896098476203, + "language_loss": 0.85421258, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87737167, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 3.94921875 + }, + { + "auxiliary_loss_clip": 0.01127857, + "auxiliary_loss_mlp": 0.01139801, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00085044, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 7.280091274677049, + "language_loss": 0.79524165, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81791824, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 2.629244327545166 + }, + { + "auxiliary_loss_clip": 0.01127459, + "auxiliary_loss_mlp": 0.01139922, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00087678, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.4910050301579267, + "language_loss": 0.84598583, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86865968, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.6687350273132324 + }, + { + "auxiliary_loss_clip": 0.01142021, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00076401, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.229767900292832, + "language_loss": 0.8070488, + "learning_rate": 3.386758911459485e-06, + "loss": 0.82987761, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 2.5720677375793457 + }, + { + "auxiliary_loss_clip": 0.01174568, + "auxiliary_loss_mlp": 0.01141733, + "balance_loss_clip": 1.00227797, + "balance_loss_mlp": 1.00106645, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 1.9064366624986377, + "language_loss": 0.70755982, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73072278, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.5468451976776123 + }, + { + "auxiliary_loss_clip": 0.01157076, + "auxiliary_loss_mlp": 0.01139961, + "balance_loss_clip": 1.00213933, + "balance_loss_mlp": 1.0010103, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.7507115294353428, + "language_loss": 0.82605678, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84902716, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.485959768295288 + }, + { + "auxiliary_loss_clip": 0.01141771, + "auxiliary_loss_mlp": 0.01140214, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.00088215, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.6093369105128843, + "language_loss": 0.87660444, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89942425, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 2.6085567474365234 + }, + { + "auxiliary_loss_clip": 0.01142428, + "auxiliary_loss_mlp": 0.01142131, + "balance_loss_clip": 1.00222123, + "balance_loss_mlp": 1.0010829, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.6738389421734503, + "language_loss": 0.76884377, + "learning_rate": 3.38563594915581e-06, + "loss": 0.7916894, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.5814011096954346 + }, + { + "auxiliary_loss_clip": 0.01174351, + "auxiliary_loss_mlp": 0.01140989, + "balance_loss_clip": 1.00205946, + "balance_loss_mlp": 1.00118077, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.6673949198608171, + "language_loss": 0.65299773, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67615116, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.516602039337158 + }, + { + "auxiliary_loss_clip": 0.01159286, + "auxiliary_loss_mlp": 0.01141937, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.00088918, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 4.512825667872598, + "language_loss": 0.83449078, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85750306, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.5155184268951416 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01140883, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00107479, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.4505850920178056, + "language_loss": 0.7593528, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78219569, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.651970148086548 + }, + { + "auxiliary_loss_clip": 0.01159109, + "auxiliary_loss_mlp": 0.01141098, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00128913, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.484425425106031, + "language_loss": 0.71762967, + "learning_rate": 3.38451214615691e-06, + "loss": 0.7406317, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.548196792602539 + }, + { + "auxiliary_loss_clip": 0.01158688, + "auxiliary_loss_mlp": 0.01141016, + "balance_loss_clip": 1.00213146, + "balance_loss_mlp": 1.00082612, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 1.6357304127049692, + "language_loss": 0.65695649, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67995358, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.604844570159912 + }, + { + "auxiliary_loss_clip": 0.01157658, + "auxiliary_loss_mlp": 0.01140631, + "balance_loss_clip": 1.00205946, + "balance_loss_mlp": 1.00082254, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 4.607307229776551, + "language_loss": 0.71979988, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74278283, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.533081531524658 + }, + { + "auxiliary_loss_clip": 0.01127, + "auxiliary_loss_mlp": 0.01141416, + "balance_loss_clip": 1.00201023, + "balance_loss_mlp": 1.00084484, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.6644836753289833, + "language_loss": 0.7496525, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77233666, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.651819944381714 + }, + { + "auxiliary_loss_clip": 0.01128162, + "auxiliary_loss_mlp": 0.01141541, + "balance_loss_clip": 1.00209582, + "balance_loss_mlp": 1.00106442, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 2.0998930299922742, + "language_loss": 0.85693169, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87962866, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.615963935852051 + }, + { + "auxiliary_loss_clip": 0.0112617, + "auxiliary_loss_mlp": 0.01141074, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00107503, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.7605204011668119, + "language_loss": 0.83075809, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85343057, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.65024471282959 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01140968, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.00087309, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 1.7973328125236858, + "language_loss": 0.79293156, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81593049, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.508153200149536 + }, + { + "auxiliary_loss_clip": 0.01141687, + "auxiliary_loss_mlp": 0.01128328, + "balance_loss_clip": 1.00363684, + "balance_loss_mlp": 1.00015378, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7725432589729506, + "language_loss": 0.62236905, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64506924, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.1741950511932373 + }, + { + "auxiliary_loss_clip": 0.01140886, + "auxiliary_loss_mlp": 0.01140161, + "balance_loss_clip": 1.00213468, + "balance_loss_mlp": 1.00092411, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.7407239533658876, + "language_loss": 0.89545906, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91826952, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.645606756210327 + }, + { + "auxiliary_loss_clip": 0.01158711, + "auxiliary_loss_mlp": 0.01141702, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.00103533, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6648806408425314, + "language_loss": 0.87202537, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89502954, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.573585033416748 + }, + { + "auxiliary_loss_clip": 0.01157765, + "auxiliary_loss_mlp": 0.01141403, + "balance_loss_clip": 1.00205553, + "balance_loss_mlp": 1.00083113, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 6.454642373374736, + "language_loss": 0.73367846, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75667012, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.5951414108276367 + }, + { + "auxiliary_loss_clip": 0.01124649, + "auxiliary_loss_mlp": 0.01141004, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.0011003, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 4.70691563919306, + "language_loss": 0.80829978, + "learning_rate": 3.381417358643549e-06, + "loss": 0.83095628, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.655449151992798 + }, + { + "auxiliary_loss_clip": 0.01139123, + "auxiliary_loss_mlp": 0.00747278, + "balance_loss_clip": 1.00244915, + "balance_loss_mlp": 1.00042045, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.8186179733437, + "language_loss": 0.58827996, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.607144, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.191884994506836 + }, + { + "auxiliary_loss_clip": 0.0115773, + "auxiliary_loss_mlp": 0.01140847, + "balance_loss_clip": 1.00202, + "balance_loss_mlp": 1.00084758, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.597874206133586, + "language_loss": 0.74217224, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.765158, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.619642972946167 + }, + { + "auxiliary_loss_clip": 0.01174492, + "auxiliary_loss_mlp": 0.0114112, + "balance_loss_clip": 1.00223064, + "balance_loss_mlp": 1.00121641, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.360299929045382, + "language_loss": 0.79576766, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81892383, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.6665005683898926 + }, + { + "auxiliary_loss_clip": 0.01140571, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_clip": 1.00199234, + "balance_loss_mlp": 1.00107586, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 1.9230569559518498, + "language_loss": 0.78816915, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81098557, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.580117702484131 + }, + { + "auxiliary_loss_clip": 0.01110311, + "auxiliary_loss_mlp": 0.01141412, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00084043, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 1.905484639260726, + "language_loss": 0.81149447, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83401167, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.659080982208252 + }, + { + "auxiliary_loss_clip": 0.01125454, + "auxiliary_loss_mlp": 0.00748172, + "balance_loss_clip": 1.00173581, + "balance_loss_mlp": 1.0006094, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.839399847651706, + "language_loss": 0.81249833, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83123457, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 4.108349800109863 + }, + { + "auxiliary_loss_clip": 0.01141361, + "auxiliary_loss_mlp": 0.01140489, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00087142, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.5834503961655477, + "language_loss": 0.83202291, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85484141, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.598402738571167 + }, + { + "auxiliary_loss_clip": 0.01126499, + "auxiliary_loss_mlp": 0.01141059, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00086904, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.8645879726435888, + "language_loss": 0.63855076, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66122627, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.0115913, + "auxiliary_loss_mlp": 0.01141025, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00073957, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.6677882403302795, + "language_loss": 0.78357875, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80658025, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.544731855392456 + }, + { + "auxiliary_loss_clip": 0.01125443, + "auxiliary_loss_mlp": 0.01141428, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00095201, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.8082136724392948, + "language_loss": 0.79484403, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81751275, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.6520631313323975 + }, + { + "auxiliary_loss_clip": 0.01125525, + "auxiliary_loss_mlp": 0.01140556, + "balance_loss_clip": 1.00202811, + "balance_loss_mlp": 1.00093794, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.8484063436357756, + "language_loss": 0.80840689, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.83106768, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 4.09376072883606 + }, + { + "auxiliary_loss_clip": 0.01142174, + "auxiliary_loss_mlp": 0.01141891, + "balance_loss_clip": 1.00228298, + "balance_loss_mlp": 1.00131917, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5253065875202942, + "language_loss": 0.7892223, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.81206292, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.7140612602233887 + }, + { + "auxiliary_loss_clip": 0.01142992, + "auxiliary_loss_mlp": 0.01141279, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00089788, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 1.6995235841284273, + "language_loss": 0.7016269, + "learning_rate": 3.377751711782227e-06, + "loss": 0.7244696, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 4.007396221160889 + }, + { + "auxiliary_loss_clip": 0.01147146, + "auxiliary_loss_mlp": 0.01141571, + "balance_loss_clip": 1.00265861, + "balance_loss_mlp": 1.00099897, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.619492686832293, + "language_loss": 0.77646589, + "learning_rate": 3.377469372935791e-06, + "loss": 0.79935294, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.5846750736236572 + }, + { + "auxiliary_loss_clip": 0.01130355, + "auxiliary_loss_mlp": 0.01140181, + "balance_loss_clip": 1.00238109, + "balance_loss_mlp": 1.00084889, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.8453772040339598, + "language_loss": 0.79269731, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81540269, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.588190793991089 + }, + { + "auxiliary_loss_clip": 0.01158902, + "auxiliary_loss_mlp": 0.011409, + "balance_loss_clip": 1.00212574, + "balance_loss_mlp": 1.00099611, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.6801285143760474, + "language_loss": 0.80925554, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.83225358, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 3.962587594985962 + }, + { + "auxiliary_loss_clip": 0.01125668, + "auxiliary_loss_mlp": 0.01141285, + "balance_loss_clip": 1.00206482, + "balance_loss_mlp": 1.00099969, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.141707333049943, + "language_loss": 0.84621245, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86888194, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.643895149230957 + }, + { + "auxiliary_loss_clip": 0.01127142, + "auxiliary_loss_mlp": 0.0074806, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00068974, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.9146783530053995, + "language_loss": 0.79755068, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81630272, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 2.671658754348755 + }, + { + "auxiliary_loss_clip": 0.01098335, + "auxiliary_loss_mlp": 0.01140976, + "balance_loss_clip": 1.00229561, + "balance_loss_mlp": 1.00078583, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.520517000600251, + "language_loss": 0.75984043, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78223348, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.721863269805908 + }, + { + "auxiliary_loss_clip": 0.01157677, + "auxiliary_loss_mlp": 0.0114118, + "balance_loss_clip": 1.00201988, + "balance_loss_mlp": 1.0010848, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 3.3814149569356595, + "language_loss": 0.78447998, + "learning_rate": 3.375774243322725e-06, + "loss": 0.80746853, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 2.536630868911743 + }, + { + "auxiliary_loss_clip": 0.01125571, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00106585, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 1.724197845494373, + "language_loss": 0.79282403, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81548947, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 2.6398816108703613 + }, + { + "auxiliary_loss_clip": 0.01159109, + "auxiliary_loss_mlp": 0.01140442, + "balance_loss_clip": 1.00231922, + "balance_loss_mlp": 1.00082409, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.916084175960714, + "language_loss": 0.75073594, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77373147, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.5921308994293213 + }, + { + "auxiliary_loss_clip": 0.01143261, + "auxiliary_loss_mlp": 0.01141573, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.0010972, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.9588219997618945, + "language_loss": 0.7459265, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.76877475, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.581942081451416 + }, + { + "auxiliary_loss_clip": 0.01157535, + "auxiliary_loss_mlp": 0.01140716, + "balance_loss_clip": 1.00202453, + "balance_loss_mlp": 1.00081182, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.2550865881588127, + "language_loss": 0.72193408, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74491656, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 2.535600423812866 + }, + { + "auxiliary_loss_clip": 0.01159046, + "auxiliary_loss_mlp": 0.01141436, + "balance_loss_clip": 1.00215578, + "balance_loss_mlp": 1.00095975, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.7730754012589132, + "language_loss": 0.77650034, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79950511, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.525888204574585 + }, + { + "auxiliary_loss_clip": 0.01174475, + "auxiliary_loss_mlp": 0.01141068, + "balance_loss_clip": 1.0021106, + "balance_loss_mlp": 1.00078213, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.8748343446667293, + "language_loss": 0.70267606, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72583151, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.4938220977783203 + }, + { + "auxiliary_loss_clip": 0.01174336, + "auxiliary_loss_mlp": 0.01140195, + "balance_loss_clip": 1.00223994, + "balance_loss_mlp": 1.00086355, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6567041090246364, + "language_loss": 0.70552093, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72866625, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.483722686767578 + }, + { + "auxiliary_loss_clip": 0.01159211, + "auxiliary_loss_mlp": 0.01141324, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.0009433, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.4772374578467213, + "language_loss": 0.63640881, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65941417, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.5669822692871094 + }, + { + "auxiliary_loss_clip": 0.01157791, + "auxiliary_loss_mlp": 0.01140499, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00107145, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.5790953525391844, + "language_loss": 0.70835948, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.73134243, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.5610203742980957 + }, + { + "auxiliary_loss_clip": 0.01157577, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00072587, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.719000106021548, + "language_loss": 0.75151086, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77448809, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.525526523590088 + }, + { + "auxiliary_loss_clip": 0.01174228, + "auxiliary_loss_mlp": 0.01140264, + "balance_loss_clip": 1.00207996, + "balance_loss_mlp": 1.00083661, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.5403776017799287, + "language_loss": 0.77102876, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79417372, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.5374412536621094 + }, + { + "auxiliary_loss_clip": 0.0115767, + "auxiliary_loss_mlp": 0.01140358, + "balance_loss_clip": 1.00213897, + "balance_loss_mlp": 1.00083554, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 1.8202980058102731, + "language_loss": 0.74063563, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76361585, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.5846195220947266 + }, + { + "auxiliary_loss_clip": 0.01174273, + "auxiliary_loss_mlp": 0.0113987, + "balance_loss_clip": 1.00210524, + "balance_loss_mlp": 1.00082397, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 2.188850416352144, + "language_loss": 0.8038007, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82694215, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.563448190689087 + }, + { + "auxiliary_loss_clip": 0.0110923, + "auxiliary_loss_mlp": 0.0114147, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.00089872, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.8243861114382594, + "language_loss": 0.76345938, + "learning_rate": 3.371811641167852e-06, + "loss": 0.7859664, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.682508707046509 + }, + { + "auxiliary_loss_clip": 0.0110811, + "auxiliary_loss_mlp": 0.01140074, + "balance_loss_clip": 1.00166678, + "balance_loss_mlp": 1.00074172, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.743583738908865, + "language_loss": 0.76156676, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78404856, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.6581122875213623 + }, + { + "auxiliary_loss_clip": 0.01140891, + "auxiliary_loss_mlp": 0.01140327, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00090027, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 1.4022734874177463, + "language_loss": 0.75841558, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78122777, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.6489670276641846 + }, + { + "auxiliary_loss_clip": 0.01142413, + "auxiliary_loss_mlp": 0.011413, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00110984, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.7758335987057725, + "language_loss": 0.63158858, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65442568, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.5521738529205322 + }, + { + "auxiliary_loss_clip": 0.01140843, + "auxiliary_loss_mlp": 0.01140395, + "balance_loss_clip": 1.00193846, + "balance_loss_mlp": 1.00115895, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.081917061571618, + "language_loss": 0.76185352, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78466594, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.5798404216766357 + }, + { + "auxiliary_loss_clip": 0.01131894, + "auxiliary_loss_mlp": 0.01140807, + "balance_loss_clip": 1.00309765, + "balance_loss_mlp": 1.00090301, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.2154157370780223, + "language_loss": 0.78550696, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80823398, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.561647653579712 + }, + { + "auxiliary_loss_clip": 0.0112671, + "auxiliary_loss_mlp": 0.01140812, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00081253, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.664689137758259, + "language_loss": 0.77963799, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80231321, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.600508213043213 + }, + { + "auxiliary_loss_clip": 0.01174419, + "auxiliary_loss_mlp": 0.00748215, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00065732, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.7659688734195782, + "language_loss": 0.88317072, + "learning_rate": 3.369826514835332e-06, + "loss": 0.90239704, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.6347506046295166 + }, + { + "auxiliary_loss_clip": 0.01142089, + "auxiliary_loss_mlp": 0.01141422, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00094581, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 1.9102385805574016, + "language_loss": 0.81832528, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84116042, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.617495059967041 + }, + { + "auxiliary_loss_clip": 0.01130595, + "auxiliary_loss_mlp": 0.01140472, + "balance_loss_clip": 1.00249064, + "balance_loss_mlp": 1.00085378, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.363018551229229, + "language_loss": 0.74485421, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76756489, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.67332124710083 + }, + { + "auxiliary_loss_clip": 0.01125901, + "auxiliary_loss_mlp": 0.0114092, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00073028, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.5874640154166484, + "language_loss": 0.77749223, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.80016041, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 4.237722158432007 + }, + { + "auxiliary_loss_clip": 0.01157573, + "auxiliary_loss_mlp": 0.01140599, + "balance_loss_clip": 1.00212359, + "balance_loss_mlp": 1.00069499, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.529414211740083, + "language_loss": 0.66773659, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69071835, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.586595058441162 + }, + { + "auxiliary_loss_clip": 0.01141951, + "auxiliary_loss_mlp": 0.0114143, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.0009532, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.3817803882624826, + "language_loss": 0.75004852, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77288234, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.564419984817505 + }, + { + "auxiliary_loss_clip": 0.01131389, + "auxiliary_loss_mlp": 0.01140566, + "balance_loss_clip": 1.00226676, + "balance_loss_mlp": 1.00075722, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.5208008820699734, + "language_loss": 0.62400603, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64672565, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.8026862144470215 + }, + { + "auxiliary_loss_clip": 0.01127124, + "auxiliary_loss_mlp": 0.0114067, + "balance_loss_clip": 1.00210762, + "balance_loss_mlp": 1.00086153, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.48899296796528, + "language_loss": 0.73175609, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75443405, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.71234393119812 + }, + { + "auxiliary_loss_clip": 0.01174218, + "auxiliary_loss_mlp": 0.0114043, + "balance_loss_clip": 1.00210106, + "balance_loss_mlp": 1.00090778, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.6006114632143662, + "language_loss": 0.75266433, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77581078, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 3.9518065452575684 + }, + { + "auxiliary_loss_clip": 0.0115919, + "auxiliary_loss_mlp": 0.01140467, + "balance_loss_clip": 1.00217772, + "balance_loss_mlp": 1.00065827, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.7638806238941043, + "language_loss": 0.80137312, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.82436967, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.509749412536621 + }, + { + "auxiliary_loss_clip": 0.0114105, + "auxiliary_loss_mlp": 0.01140683, + "balance_loss_clip": 1.00202763, + "balance_loss_mlp": 1.00106525, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.8151496958000677, + "language_loss": 0.81203365, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.83485103, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 4.050337076187134 + }, + { + "auxiliary_loss_clip": 0.01078185, + "auxiliary_loss_mlp": 0.01140256, + "balance_loss_clip": 1.00171542, + "balance_loss_mlp": 1.00073326, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.000436378384626, + "language_loss": 0.73352432, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75570875, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.7637453079223633 + }, + { + "auxiliary_loss_clip": 0.01174176, + "auxiliary_loss_mlp": 0.01140616, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00080752, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.9106849791316483, + "language_loss": 0.78323019, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80637813, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 3.8771109580993652 + }, + { + "auxiliary_loss_clip": 0.011424, + "auxiliary_loss_mlp": 0.0114099, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00118089, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.587889030678688, + "language_loss": 0.69418114, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71701503, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.700551986694336 + }, + { + "auxiliary_loss_clip": 0.01126838, + "auxiliary_loss_mlp": 0.01140126, + "balance_loss_clip": 1.00187218, + "balance_loss_mlp": 1.00098455, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 2.2634243946041472, + "language_loss": 0.70396835, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72663802, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.6228067874908447 + }, + { + "auxiliary_loss_clip": 0.01156122, + "auxiliary_loss_mlp": 0.01127448, + "balance_loss_clip": 1.00304222, + "balance_loss_mlp": 1.00003743, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7284168028801922, + "language_loss": 0.5927844, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61562008, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.183980703353882 + }, + { + "auxiliary_loss_clip": 0.01143316, + "auxiliary_loss_mlp": 0.0114038, + "balance_loss_clip": 1.00206637, + "balance_loss_mlp": 1.00114369, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.3866999597632044, + "language_loss": 0.82008159, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84291857, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 2.6893112659454346 + }, + { + "auxiliary_loss_clip": 0.01142424, + "auxiliary_loss_mlp": 0.01141365, + "balance_loss_clip": 1.00204778, + "balance_loss_mlp": 1.00079393, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.5714604387846027, + "language_loss": 0.80384851, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82668638, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.6521589756011963 + }, + { + "auxiliary_loss_clip": 0.01141029, + "auxiliary_loss_mlp": 0.01127539, + "balance_loss_clip": 1.00294662, + "balance_loss_mlp": 1.00012827, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8765126256345851, + "language_loss": 0.62837845, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.65106416, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 3.022578716278076 + }, + { + "auxiliary_loss_clip": 0.0114212, + "auxiliary_loss_mlp": 0.01140471, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.0006628, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.471041549912818, + "language_loss": 0.73804224, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76086813, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 2.628063440322876 + }, + { + "auxiliary_loss_clip": 0.0113136, + "auxiliary_loss_mlp": 0.0114114, + "balance_loss_clip": 1.00243628, + "balance_loss_mlp": 1.00104487, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.8405268862012814, + "language_loss": 0.79232037, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81504542, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.646153450012207 + }, + { + "auxiliary_loss_clip": 0.01157665, + "auxiliary_loss_mlp": 0.00748337, + "balance_loss_clip": 1.00211239, + "balance_loss_mlp": 1.00071764, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 1.7802900898037906, + "language_loss": 0.71306062, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73212063, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.6364455223083496 + }, + { + "auxiliary_loss_clip": 0.01174257, + "auxiliary_loss_mlp": 0.01141674, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.0011977, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 2.135326188205712, + "language_loss": 0.82073694, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84389627, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 2.53257417678833 + }, + { + "auxiliary_loss_clip": 0.0114104, + "auxiliary_loss_mlp": 0.01140925, + "balance_loss_clip": 1.00208259, + "balance_loss_mlp": 1.00092614, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7095332583968632, + "language_loss": 0.75064683, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77346647, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.586670398712158 + }, + { + "auxiliary_loss_clip": 0.01157701, + "auxiliary_loss_mlp": 0.01141079, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00107932, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.9710935635152507, + "language_loss": 0.78317857, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80616641, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.6382415294647217 + }, + { + "auxiliary_loss_clip": 0.01141831, + "auxiliary_loss_mlp": 0.01140342, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00081944, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.6682097107002642, + "language_loss": 0.73818898, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.76101077, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.589090347290039 + }, + { + "auxiliary_loss_clip": 0.01142192, + "auxiliary_loss_mlp": 0.01141739, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00116777, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.2245608944633304, + "language_loss": 0.74451596, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76735526, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.5669138431549072 + }, + { + "auxiliary_loss_clip": 0.01126676, + "auxiliary_loss_mlp": 0.0114119, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00099981, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.5186837797487833, + "language_loss": 0.671067, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69374567, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 2.579972267150879 + }, + { + "auxiliary_loss_clip": 0.01140934, + "auxiliary_loss_mlp": 0.01141336, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00085974, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.6391321897072924, + "language_loss": 0.7262615, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74908423, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.6310081481933594 + }, + { + "auxiliary_loss_clip": 0.01158571, + "auxiliary_loss_mlp": 0.01140318, + "balance_loss_clip": 1.00217962, + "balance_loss_mlp": 1.00089097, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.928516609090624, + "language_loss": 0.80740398, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.83039284, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.552346706390381 + }, + { + "auxiliary_loss_clip": 0.01159208, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_clip": 1.00226402, + "balance_loss_mlp": 1.00095081, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.8776465878301505, + "language_loss": 0.79519671, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81820405, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.5570361614227295 + }, + { + "auxiliary_loss_clip": 0.01108741, + "auxiliary_loss_mlp": 0.00748265, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00061524, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 3.0006563927525045, + "language_loss": 0.82209325, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84066331, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.699737310409546 + }, + { + "auxiliary_loss_clip": 0.01174492, + "auxiliary_loss_mlp": 0.0114129, + "balance_loss_clip": 1.00236225, + "balance_loss_mlp": 1.00071895, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.7027530479251574, + "language_loss": 0.70380986, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72696769, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.517763137817383 + }, + { + "auxiliary_loss_clip": 0.01142507, + "auxiliary_loss_mlp": 0.01140815, + "balance_loss_clip": 1.00189781, + "balance_loss_mlp": 1.00110197, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.4676326559325847, + "language_loss": 0.78516918, + "learning_rate": 3.360433840760998e-06, + "loss": 0.80800241, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.616447925567627 + }, + { + "auxiliary_loss_clip": 0.01142021, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00097251, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.749310848016499, + "language_loss": 0.92279613, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94562411, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.6146962642669678 + }, + { + "auxiliary_loss_clip": 0.01157746, + "auxiliary_loss_mlp": 0.01141345, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 1.0008688, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.595367037276969, + "language_loss": 0.88892567, + "learning_rate": 3.3598627783049e-06, + "loss": 0.91191661, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.5999739170074463 + }, + { + "auxiliary_loss_clip": 0.01157889, + "auxiliary_loss_mlp": 0.0114182, + "balance_loss_clip": 1.00237942, + "balance_loss_mlp": 1.00105786, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.046896901043176, + "language_loss": 0.78617418, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80917126, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.8814361095428467 + }, + { + "auxiliary_loss_clip": 0.01158961, + "auxiliary_loss_mlp": 0.01140765, + "balance_loss_clip": 1.00221646, + "balance_loss_mlp": 1.00095654, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 3.010954180280159, + "language_loss": 0.66361034, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68660754, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.572678565979004 + }, + { + "auxiliary_loss_clip": 0.01125458, + "auxiliary_loss_mlp": 0.01141549, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00097775, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 2.2800142324321016, + "language_loss": 0.76583827, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78850836, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.7940304279327393 + }, + { + "auxiliary_loss_clip": 0.01141038, + "auxiliary_loss_mlp": 0.01141461, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00098515, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.642593633998247, + "language_loss": 0.66964483, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.69246984, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.575517177581787 + }, + { + "auxiliary_loss_clip": 0.01141135, + "auxiliary_loss_mlp": 0.01142068, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00082886, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.6661440393363098, + "language_loss": 0.74736536, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.77019745, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.6593477725982666 + }, + { + "auxiliary_loss_clip": 0.0112475, + "auxiliary_loss_mlp": 0.01141431, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.00085974, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 3.816123747832143, + "language_loss": 0.83720601, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85986787, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 4.2125749588012695 + }, + { + "auxiliary_loss_clip": 0.01157663, + "auxiliary_loss_mlp": 0.01141473, + "balance_loss_clip": 1.00235486, + "balance_loss_mlp": 1.00099707, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 2.1348615579950083, + "language_loss": 0.79473162, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81772298, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.5329902172088623 + }, + { + "auxiliary_loss_clip": 0.01174517, + "auxiliary_loss_mlp": 0.01141787, + "balance_loss_clip": 1.00232363, + "balance_loss_mlp": 1.00102437, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 8.371787785318928, + "language_loss": 0.70830435, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73146737, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.5176572799682617 + }, + { + "auxiliary_loss_clip": 0.01141554, + "auxiliary_loss_mlp": 0.01141036, + "balance_loss_clip": 1.00191605, + "balance_loss_mlp": 1.00055981, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8349043078811071, + "language_loss": 0.74134469, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76417059, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.5661306381225586 + }, + { + "auxiliary_loss_clip": 0.01140442, + "auxiliary_loss_mlp": 0.01141658, + "balance_loss_clip": 1.00199807, + "balance_loss_mlp": 1.00118196, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.6482927425263096, + "language_loss": 0.8005439, + "learning_rate": 3.357004373789946e-06, + "loss": 0.82336485, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 3.968590497970581 + }, + { + "auxiliary_loss_clip": 0.01174473, + "auxiliary_loss_mlp": 0.01141192, + "balance_loss_clip": 1.00232828, + "balance_loss_mlp": 1.0010016, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.0907096845532673, + "language_loss": 0.59926659, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62242323, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.5712826251983643 + }, + { + "auxiliary_loss_clip": 0.01159195, + "auxiliary_loss_mlp": 0.01140817, + "balance_loss_clip": 1.00230098, + "balance_loss_mlp": 1.00100875, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.6960075720507441, + "language_loss": 0.86588371, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88888383, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 3.948357582092285 + }, + { + "auxiliary_loss_clip": 0.01126436, + "auxiliary_loss_mlp": 0.01141209, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00082779, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 1.888277171566696, + "language_loss": 0.89850801, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92118448, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.589479684829712 + }, + { + "auxiliary_loss_clip": 0.01157596, + "auxiliary_loss_mlp": 0.01140685, + "balance_loss_clip": 1.0022682, + "balance_loss_mlp": 1.00125754, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.4401408447702535, + "language_loss": 0.72063363, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74361646, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.5936405658721924 + }, + { + "auxiliary_loss_clip": 0.01142288, + "auxiliary_loss_mlp": 0.01140911, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00091195, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 3.6102021436353984, + "language_loss": 0.77825224, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80108428, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.6900694370269775 + }, + { + "auxiliary_loss_clip": 0.01124617, + "auxiliary_loss_mlp": 0.01141949, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00090027, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 2.329925943851627, + "language_loss": 0.76368904, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78635478, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 4.043091773986816 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01142059, + "balance_loss_clip": 1.00224447, + "balance_loss_mlp": 1.00129652, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 2.372552829169364, + "language_loss": 0.57370102, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59686649, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.470493793487549 + }, + { + "auxiliary_loss_clip": 0.01126689, + "auxiliary_loss_mlp": 0.01141229, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00103939, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 2.2397626752573676, + "language_loss": 0.74836195, + "learning_rate": 3.354713944700797e-06, + "loss": 0.77104115, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.642850875854492 + }, + { + "auxiliary_loss_clip": 0.01157734, + "auxiliary_loss_mlp": 0.01141351, + "balance_loss_clip": 1.0023129, + "balance_loss_mlp": 1.00106537, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 3.107844698244196, + "language_loss": 0.77267253, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79566336, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.50508713722229 + }, + { + "auxiliary_loss_clip": 0.01157816, + "auxiliary_loss_mlp": 0.01140793, + "balance_loss_clip": 1.00223148, + "balance_loss_mlp": 1.00098419, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 3.239775961512538, + "language_loss": 0.82588714, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84887326, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.49531888961792 + }, + { + "auxiliary_loss_clip": 0.01108808, + "auxiliary_loss_mlp": 0.01141397, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00082564, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.902719197482284, + "language_loss": 0.79329503, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81579709, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 2.637917995452881 + }, + { + "auxiliary_loss_clip": 0.01157239, + "auxiliary_loss_mlp": 0.01128393, + "balance_loss_clip": 1.00438666, + "balance_loss_mlp": 1.00021958, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7684217563731591, + "language_loss": 0.60416037, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62701666, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.112361192703247 + }, + { + "auxiliary_loss_clip": 0.01174286, + "auxiliary_loss_mlp": 0.01141162, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00087643, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.0849785112020176, + "language_loss": 0.8064996, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.8296541, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.461214542388916 + }, + { + "auxiliary_loss_clip": 0.01158706, + "auxiliary_loss_mlp": 0.01141047, + "balance_loss_clip": 1.00221705, + "balance_loss_mlp": 1.00104737, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.7223186752044202, + "language_loss": 0.70562005, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72861761, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.6084885597229004 + }, + { + "auxiliary_loss_clip": 0.01157652, + "auxiliary_loss_mlp": 0.01140108, + "balance_loss_clip": 1.0022347, + "balance_loss_mlp": 1.00077653, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.496629783223382, + "language_loss": 0.81747007, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84044766, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 2.6635584831237793 + }, + { + "auxiliary_loss_clip": 0.01174325, + "auxiliary_loss_mlp": 0.01140434, + "balance_loss_clip": 1.00228107, + "balance_loss_mlp": 1.00100684, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.6922894589556599, + "language_loss": 0.7981987, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82134628, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 2.6466352939605713 + }, + { + "auxiliary_loss_clip": 0.01159003, + "auxiliary_loss_mlp": 0.01140712, + "balance_loss_clip": 1.00220418, + "balance_loss_mlp": 1.00080836, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 2.066146363069153, + "language_loss": 0.78663814, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.80963534, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.5403425693511963 + }, + { + "auxiliary_loss_clip": 0.01174441, + "auxiliary_loss_mlp": 0.01141217, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.0009315, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.2085833142189717, + "language_loss": 0.8937155, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91687202, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.4756386280059814 + }, + { + "auxiliary_loss_clip": 0.01157375, + "auxiliary_loss_mlp": 0.01140339, + "balance_loss_clip": 1.00211358, + "balance_loss_mlp": 1.00100756, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.7268906615628432, + "language_loss": 0.82344872, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84642589, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.666677236557007 + }, + { + "auxiliary_loss_clip": 0.01108575, + "auxiliary_loss_mlp": 0.0114037, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00103855, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4862805827752683, + "language_loss": 0.8399173, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86240673, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.7435927391052246 + }, + { + "auxiliary_loss_clip": 0.01126478, + "auxiliary_loss_mlp": 0.01127815, + "balance_loss_clip": 1.00347805, + "balance_loss_mlp": 1.00040436, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.861347133642758, + "language_loss": 0.60987866, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63242161, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 3.3909926414489746 + }, + { + "auxiliary_loss_clip": 0.01174299, + "auxiliary_loss_mlp": 0.01140253, + "balance_loss_clip": 1.00232089, + "balance_loss_mlp": 1.00082612, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.914276928345574, + "language_loss": 0.66218603, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.68533152, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 2.5337181091308594 + }, + { + "auxiliary_loss_clip": 0.01157545, + "auxiliary_loss_mlp": 0.01141352, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.0009712, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.4711421335524286, + "language_loss": 0.62745714, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65044606, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.684903621673584 + }, + { + "auxiliary_loss_clip": 0.0115891, + "auxiliary_loss_mlp": 0.00748412, + "balance_loss_clip": 1.00227213, + "balance_loss_mlp": 1.00063646, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.6611582068006854, + "language_loss": 0.74282742, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.7619006, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.570053815841675 + }, + { + "auxiliary_loss_clip": 0.01140916, + "auxiliary_loss_mlp": 0.01139781, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00083089, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 2.4168529301597634, + "language_loss": 0.72789842, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.75070542, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.6141557693481445 + }, + { + "auxiliary_loss_clip": 0.01077736, + "auxiliary_loss_mlp": 0.01139985, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.00084376, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 4.1040563013645865, + "language_loss": 0.7422986, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76447582, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.7363109588623047 + }, + { + "auxiliary_loss_clip": 0.01124715, + "auxiliary_loss_mlp": 0.01140271, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.00093937, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.4206281864388626, + "language_loss": 0.75800145, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78065133, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.6058669090270996 + }, + { + "auxiliary_loss_clip": 0.0112689, + "auxiliary_loss_mlp": 0.01140338, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00071967, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 4.997669089465671, + "language_loss": 0.77201056, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79468286, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.6440296173095703 + }, + { + "auxiliary_loss_clip": 0.01131697, + "auxiliary_loss_mlp": 0.01141065, + "balance_loss_clip": 1.00302994, + "balance_loss_mlp": 1.00106549, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 13.26495493127548, + "language_loss": 0.71030629, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73303396, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.6225173473358154 + }, + { + "auxiliary_loss_clip": 0.01157422, + "auxiliary_loss_mlp": 0.01139735, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00078464, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.4283728011198644, + "language_loss": 0.76050889, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78348041, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.626173734664917 + }, + { + "auxiliary_loss_clip": 0.01157326, + "auxiliary_loss_mlp": 0.01139491, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.0006361, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.7265407061294415, + "language_loss": 0.77877057, + "learning_rate": 3.348110666737214e-06, + "loss": 0.80173874, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.5887374877929688 + }, + { + "auxiliary_loss_clip": 0.01174214, + "auxiliary_loss_mlp": 0.01140854, + "balance_loss_clip": 1.00223875, + "balance_loss_mlp": 1.00104523, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 4.867436565503439, + "language_loss": 0.65438122, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.6775319, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.5425493717193604 + }, + { + "auxiliary_loss_clip": 0.01148558, + "auxiliary_loss_mlp": 0.01140625, + "balance_loss_clip": 1.00283837, + "balance_loss_mlp": 1.00081623, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 2.7949186435511004, + "language_loss": 0.70715791, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.73004973, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 4.03193736076355 + }, + { + "auxiliary_loss_clip": 0.01109282, + "auxiliary_loss_mlp": 0.01140719, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00091016, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.660277045503275, + "language_loss": 0.74805832, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.7705583, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.688758611679077 + }, + { + "auxiliary_loss_clip": 0.01114821, + "auxiliary_loss_mlp": 0.01140686, + "balance_loss_clip": 1.00248325, + "balance_loss_mlp": 1.00087738, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 3.8547001241886267, + "language_loss": 0.67719018, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69974524, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.70306396484375 + }, + { + "auxiliary_loss_clip": 0.01158749, + "auxiliary_loss_mlp": 0.01127584, + "balance_loss_clip": 1.00408578, + "balance_loss_mlp": 1.00017345, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7651186828031639, + "language_loss": 0.56892973, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.59179306, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.1237435340881348 + }, + { + "auxiliary_loss_clip": 0.0109341, + "auxiliary_loss_mlp": 0.00748459, + "balance_loss_clip": 1.00205064, + "balance_loss_mlp": 1.00079131, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.0963537279455164, + "language_loss": 0.83539617, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85381484, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.69081974029541 + }, + { + "auxiliary_loss_clip": 0.01174241, + "auxiliary_loss_mlp": 0.01140263, + "balance_loss_clip": 1.00215435, + "balance_loss_mlp": 1.00074077, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.2817287415518868, + "language_loss": 0.77758324, + "learning_rate": 3.34609559969027e-06, + "loss": 0.8007282, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 3.925774097442627 + }, + { + "auxiliary_loss_clip": 0.01146887, + "auxiliary_loss_mlp": 0.01140427, + "balance_loss_clip": 1.0026269, + "balance_loss_mlp": 1.00090456, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 2.1677705084540118, + "language_loss": 0.73495287, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75782597, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.5498974323272705 + }, + { + "auxiliary_loss_clip": 0.01158897, + "auxiliary_loss_mlp": 0.0114054, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00101757, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 8.570814440556102, + "language_loss": 0.88375825, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90675259, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 3.963351011276245 + }, + { + "auxiliary_loss_clip": 0.01163647, + "auxiliary_loss_mlp": 0.01140284, + "balance_loss_clip": 1.00272512, + "balance_loss_mlp": 1.00104713, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 2.0674846520445618, + "language_loss": 0.74181211, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76485145, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.536172866821289 + }, + { + "auxiliary_loss_clip": 0.01142637, + "auxiliary_loss_mlp": 0.01141039, + "balance_loss_clip": 1.00227976, + "balance_loss_mlp": 1.00113475, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 3.1137174591777796, + "language_loss": 0.80382729, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82666409, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.6020328998565674 + }, + { + "auxiliary_loss_clip": 0.01141618, + "auxiliary_loss_mlp": 0.01139966, + "balance_loss_clip": 1.00225639, + "balance_loss_mlp": 1.00092077, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.8551509369915025, + "language_loss": 0.73717058, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.7599864, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.59867262840271 + }, + { + "auxiliary_loss_clip": 0.01148341, + "auxiliary_loss_mlp": 0.01140928, + "balance_loss_clip": 1.00264907, + "balance_loss_mlp": 1.00092864, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.6419377047029813, + "language_loss": 0.76367927, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78657198, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 4.006023168563843 + }, + { + "auxiliary_loss_clip": 0.0112643, + "auxiliary_loss_mlp": 0.01139841, + "balance_loss_clip": 1.00218666, + "balance_loss_mlp": 1.00079501, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.6111778368024325, + "language_loss": 0.81457305, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83723569, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.6027238368988037 + }, + { + "auxiliary_loss_clip": 0.01115336, + "auxiliary_loss_mlp": 0.01140552, + "balance_loss_clip": 1.00236213, + "balance_loss_mlp": 1.00083852, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 4.473419514302385, + "language_loss": 0.86444449, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88700336, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 2.6606979370117188 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.0114088, + "balance_loss_clip": 1.00213778, + "balance_loss_mlp": 1.00116718, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.532222291513827, + "language_loss": 0.71467781, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73733068, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.6273036003112793 + }, + { + "auxiliary_loss_clip": 0.01141107, + "auxiliary_loss_mlp": 0.01140675, + "balance_loss_clip": 1.00225818, + "balance_loss_mlp": 1.00105667, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 5.296003359255312, + "language_loss": 0.77069664, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79351449, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.628770351409912 + }, + { + "auxiliary_loss_clip": 0.01125335, + "auxiliary_loss_mlp": 0.01139924, + "balance_loss_clip": 1.00227499, + "balance_loss_mlp": 1.00078309, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.4438258952443725, + "language_loss": 0.75924778, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.78190035, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.6950228214263916 + }, + { + "auxiliary_loss_clip": 0.01174245, + "auxiliary_loss_mlp": 0.01139656, + "balance_loss_clip": 1.00226521, + "balance_loss_mlp": 1.00089669, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.8931120816046434, + "language_loss": 0.83049208, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85363114, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 2.571913242340088 + }, + { + "auxiliary_loss_clip": 0.01125388, + "auxiliary_loss_mlp": 0.00748171, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00051284, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.7917214370320864, + "language_loss": 0.79618025, + "learning_rate": 3.342346699429516e-06, + "loss": 0.81491578, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.6189935207366943 + }, + { + "auxiliary_loss_clip": 0.01140814, + "auxiliary_loss_mlp": 0.01139871, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00082517, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.7510081687468995, + "language_loss": 0.83282626, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85563314, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.61873722076416 + }, + { + "auxiliary_loss_clip": 0.01109383, + "auxiliary_loss_mlp": 0.01140245, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00081766, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.7476818347174063, + "language_loss": 0.7362358, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75873208, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 2.7145705223083496 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01139484, + "balance_loss_clip": 1.00234032, + "balance_loss_mlp": 1.00082016, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.657307988023196, + "language_loss": 0.83608782, + "learning_rate": 3.341480346078704e-06, + "loss": 0.85905874, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.557072639465332 + }, + { + "auxiliary_loss_clip": 0.01159087, + "auxiliary_loss_mlp": 0.01140014, + "balance_loss_clip": 1.00231528, + "balance_loss_mlp": 1.00077796, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 2.0956612666498566, + "language_loss": 0.78205621, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80504715, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 2.565821647644043 + }, + { + "auxiliary_loss_clip": 0.01140898, + "auxiliary_loss_mlp": 0.01139789, + "balance_loss_clip": 1.00216603, + "balance_loss_mlp": 1.00083864, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.9242061257901137, + "language_loss": 0.70247042, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.7252773, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.5644917488098145 + }, + { + "auxiliary_loss_clip": 0.01109126, + "auxiliary_loss_mlp": 0.01140297, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00077438, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.2393334348826643, + "language_loss": 0.79322588, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81572008, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.6779141426086426 + }, + { + "auxiliary_loss_clip": 0.01141846, + "auxiliary_loss_mlp": 0.01140133, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00080073, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.6795624150704276, + "language_loss": 0.78127044, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80409026, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.7603752613067627 + }, + { + "auxiliary_loss_clip": 0.01157437, + "auxiliary_loss_mlp": 0.01139577, + "balance_loss_clip": 1.00218391, + "balance_loss_mlp": 1.00110388, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.1412850267209294, + "language_loss": 0.83261245, + "learning_rate": 3.340035406592074e-06, + "loss": 0.85558259, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.5398647785186768 + }, + { + "auxiliary_loss_clip": 0.01157315, + "auxiliary_loss_mlp": 0.01139204, + "balance_loss_clip": 1.00221395, + "balance_loss_mlp": 1.00092149, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 1.9720101598947133, + "language_loss": 0.74815202, + "learning_rate": 3.339746266208074e-06, + "loss": 0.77111721, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.5613417625427246 + }, + { + "auxiliary_loss_clip": 0.01157559, + "auxiliary_loss_mlp": 0.01140101, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.00067401, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 4.081920719049483, + "language_loss": 0.72616816, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74914473, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.561105966567993 + }, + { + "auxiliary_loss_clip": 0.01126513, + "auxiliary_loss_mlp": 0.00748209, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00064349, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 1.8583811505441639, + "language_loss": 0.7444433, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76319051, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.5929460525512695 + }, + { + "auxiliary_loss_clip": 0.01157681, + "auxiliary_loss_mlp": 0.01140236, + "balance_loss_clip": 1.00217068, + "balance_loss_mlp": 1.00099993, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 5.9635745525859996, + "language_loss": 0.66035616, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.6833353, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.554898977279663 + }, + { + "auxiliary_loss_clip": 0.01174298, + "auxiliary_loss_mlp": 0.01139625, + "balance_loss_clip": 1.00224686, + "balance_loss_mlp": 1.00086498, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.735923425851994, + "language_loss": 0.82114565, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84428489, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.538768768310547 + }, + { + "auxiliary_loss_clip": 0.01128235, + "auxiliary_loss_mlp": 0.01139581, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00082135, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.651299798490961, + "language_loss": 0.90795732, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93063545, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.6852362155914307 + }, + { + "auxiliary_loss_clip": 0.01147079, + "auxiliary_loss_mlp": 0.00748308, + "balance_loss_clip": 1.00249791, + "balance_loss_mlp": 1.00068831, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 1.9516323802246371, + "language_loss": 0.74388587, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76283973, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.7036006450653076 + }, + { + "auxiliary_loss_clip": 0.0112693, + "auxiliary_loss_mlp": 0.0112713, + "balance_loss_clip": 1.00452554, + "balance_loss_mlp": 1.00048184, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.779678911801959, + "language_loss": 0.62977111, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65231174, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.1849842071533203 + }, + { + "auxiliary_loss_clip": 0.01111106, + "auxiliary_loss_mlp": 0.01139465, + "balance_loss_clip": 1.0020889, + "balance_loss_mlp": 1.00070572, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 2.0994410721890904, + "language_loss": 0.70209551, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72460115, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.6806585788726807 + }, + { + "auxiliary_loss_clip": 0.01157342, + "auxiliary_loss_mlp": 0.01139678, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00072765, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.9276780685618475, + "language_loss": 0.68243349, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70540369, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.580942392349243 + }, + { + "auxiliary_loss_clip": 0.01157367, + "auxiliary_loss_mlp": 0.01139319, + "balance_loss_clip": 1.00208616, + "balance_loss_mlp": 1.00074983, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.7619737805583795, + "language_loss": 0.69720757, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.72017443, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.645569324493408 + }, + { + "auxiliary_loss_clip": 0.01142818, + "auxiliary_loss_mlp": 0.01139047, + "balance_loss_clip": 1.00206804, + "balance_loss_mlp": 1.0008601, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.4455644635603178, + "language_loss": 0.71428752, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7371062, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 4.212166786193848 + }, + { + "auxiliary_loss_clip": 0.01123788, + "auxiliary_loss_mlp": 0.01139782, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00083172, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.5856654460692992, + "language_loss": 0.8118856, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83452129, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.608476161956787 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01139152, + "balance_loss_clip": 1.00210655, + "balance_loss_mlp": 1.00086927, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.5445442097688835, + "language_loss": 0.78340626, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.8060503, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.6251168251037598 + }, + { + "auxiliary_loss_clip": 0.01109948, + "auxiliary_loss_mlp": 0.01139665, + "balance_loss_clip": 1.00170958, + "balance_loss_mlp": 1.0007143, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.6253898230377604, + "language_loss": 0.78594381, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80843991, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.649106740951538 + }, + { + "auxiliary_loss_clip": 0.01108226, + "auxiliary_loss_mlp": 0.0113912, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00074244, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.583396053758955, + "language_loss": 0.76907557, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79154903, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 4.124852418899536 + }, + { + "auxiliary_loss_clip": 0.0115753, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_clip": 1.00214946, + "balance_loss_mlp": 1.0008235, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.337928747588073, + "language_loss": 0.7733075, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79627478, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.6013336181640625 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01127043, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00039518, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8394578598501883, + "language_loss": 0.60228682, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62477988, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 3.3651204109191895 + }, + { + "auxiliary_loss_clip": 0.01127258, + "auxiliary_loss_mlp": 0.01139979, + "balance_loss_clip": 1.00214207, + "balance_loss_mlp": 1.00083745, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.8427611583733496, + "language_loss": 0.82042986, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84310222, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 4.056950330734253 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01139612, + "balance_loss_clip": 1.00171924, + "balance_loss_mlp": 1.00094736, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 2.30326812543292, + "language_loss": 0.72205544, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74452388, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.7827465534210205 + }, + { + "auxiliary_loss_clip": 0.01158805, + "auxiliary_loss_mlp": 0.01138753, + "balance_loss_clip": 1.00221348, + "balance_loss_mlp": 1.00094712, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.3908519017170344, + "language_loss": 0.70493472, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72791028, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.545435905456543 + }, + { + "auxiliary_loss_clip": 0.01142332, + "auxiliary_loss_mlp": 0.01140432, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00109982, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.4295803294056197, + "language_loss": 0.74496239, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76779002, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 3.947458267211914 + }, + { + "auxiliary_loss_clip": 0.01123955, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00098324, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.617880106209349, + "language_loss": 0.75931537, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78195328, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.657980442047119 + }, + { + "auxiliary_loss_clip": 0.01097214, + "auxiliary_loss_mlp": 0.01140469, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00094652, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 2.163465492969143, + "language_loss": 0.80212295, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82449985, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.641148805618286 + }, + { + "auxiliary_loss_clip": 0.01125755, + "auxiliary_loss_mlp": 0.01140776, + "balance_loss_clip": 1.00201285, + "balance_loss_mlp": 1.00077653, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.6977227790707752, + "language_loss": 0.78982538, + "learning_rate": 3.332791681244776e-06, + "loss": 0.8124907, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 2.6070103645324707 + }, + { + "auxiliary_loss_clip": 0.01113106, + "auxiliary_loss_mlp": 0.01139711, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.0005697, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 1.9793277730495684, + "language_loss": 0.72548217, + "learning_rate": 3.332501274072231e-06, + "loss": 0.74801034, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.633585214614868 + }, + { + "auxiliary_loss_clip": 0.01157609, + "auxiliary_loss_mlp": 0.01139178, + "balance_loss_clip": 1.00214994, + "balance_loss_mlp": 1.00080013, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 2.0537681311727005, + "language_loss": 0.7196399, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74260783, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.5812313556671143 + }, + { + "auxiliary_loss_clip": 0.01157634, + "auxiliary_loss_mlp": 0.0113965, + "balance_loss_clip": 1.00233889, + "balance_loss_mlp": 1.00098538, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 2.217565036806405, + "language_loss": 0.66262782, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68560064, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.529632329940796 + }, + { + "auxiliary_loss_clip": 0.0114219, + "auxiliary_loss_mlp": 0.01139049, + "balance_loss_clip": 1.00207174, + "balance_loss_mlp": 1.00076604, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 1.9535945043711864, + "language_loss": 0.80906773, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83188009, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.5865461826324463 + }, + { + "auxiliary_loss_clip": 0.01174126, + "auxiliary_loss_mlp": 0.01139779, + "balance_loss_clip": 1.00214458, + "balance_loss_mlp": 1.00092387, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.7110180244776905, + "language_loss": 0.72349751, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74663651, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 2.5007903575897217 + }, + { + "auxiliary_loss_clip": 0.01174388, + "auxiliary_loss_mlp": 0.0113944, + "balance_loss_clip": 1.00232244, + "balance_loss_mlp": 1.00058532, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 1.907648093950535, + "language_loss": 0.72968167, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75281996, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.484673023223877 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01139076, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00079346, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 1.7582392116751007, + "language_loss": 0.6848923, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70785749, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.5424880981445312 + }, + { + "auxiliary_loss_clip": 0.01159176, + "auxiliary_loss_mlp": 0.01139045, + "balance_loss_clip": 1.00229633, + "balance_loss_mlp": 1.00076246, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.7162837906061716, + "language_loss": 0.8059513, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82893348, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 2.5201351642608643 + }, + { + "auxiliary_loss_clip": 0.01174269, + "auxiliary_loss_mlp": 0.01139133, + "balance_loss_clip": 1.00233722, + "balance_loss_mlp": 1.00094604, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 6.875497877462526, + "language_loss": 0.80203259, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82516658, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.5026140213012695 + }, + { + "auxiliary_loss_clip": 0.01143441, + "auxiliary_loss_mlp": 0.01138573, + "balance_loss_clip": 1.0022192, + "balance_loss_mlp": 1.00067163, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.530739930439105, + "language_loss": 0.82716262, + "learning_rate": 3.329885337055249e-06, + "loss": 0.8499828, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 2.614363431930542 + }, + { + "auxiliary_loss_clip": 0.01157406, + "auxiliary_loss_mlp": 0.01139135, + "balance_loss_clip": 1.00218439, + "balance_loss_mlp": 1.00085211, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.6123740410873473, + "language_loss": 0.79194951, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.814915, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.5165553092956543 + }, + { + "auxiliary_loss_clip": 0.01174131, + "auxiliary_loss_mlp": 0.01139011, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.00072777, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.9170008394000997, + "language_loss": 0.74796861, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.77110004, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.5562257766723633 + }, + { + "auxiliary_loss_clip": 0.01141868, + "auxiliary_loss_mlp": 0.01138127, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00070286, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6819020485344245, + "language_loss": 0.76077235, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78357232, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.5806424617767334 + }, + { + "auxiliary_loss_clip": 0.01142108, + "auxiliary_loss_mlp": 0.01138061, + "balance_loss_clip": 1.00196397, + "balance_loss_mlp": 1.0006361, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.986574216690217, + "language_loss": 0.64694279, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66974449, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.5598886013031006 + }, + { + "auxiliary_loss_clip": 0.01146542, + "auxiliary_loss_mlp": 0.01137535, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.00077868, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5761188680053702, + "language_loss": 0.72045851, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.74329931, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.5958244800567627 + }, + { + "auxiliary_loss_clip": 0.01142964, + "auxiliary_loss_mlp": 0.01137454, + "balance_loss_clip": 1.00207317, + "balance_loss_mlp": 1.00079215, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 1.7741857436633446, + "language_loss": 0.79686755, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81967175, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 2.6197328567504883 + }, + { + "auxiliary_loss_clip": 0.01126511, + "auxiliary_loss_mlp": 0.01138429, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00090897, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6727672692884445, + "language_loss": 0.80499464, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82764411, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.5760467052459717 + }, + { + "auxiliary_loss_clip": 0.01140956, + "auxiliary_loss_mlp": 0.01138224, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00070453, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.799519102942459, + "language_loss": 0.67303765, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69582951, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.716217041015625 + }, + { + "auxiliary_loss_clip": 0.01174053, + "auxiliary_loss_mlp": 0.00748143, + "balance_loss_clip": 1.00211537, + "balance_loss_mlp": 1.00061464, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 2.513619339124458, + "language_loss": 0.71576113, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73498309, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.5271430015563965 + }, + { + "auxiliary_loss_clip": 0.01173941, + "auxiliary_loss_mlp": 0.01138481, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00067508, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.8751710597405926, + "language_loss": 0.75810897, + "learning_rate": 3.326973949928776e-06, + "loss": 0.78123325, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.6264963150024414 + }, + { + "auxiliary_loss_clip": 0.01125558, + "auxiliary_loss_mlp": 0.01138018, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00068939, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.7967801063834417, + "language_loss": 0.6009168, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62355256, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.697237491607666 + }, + { + "auxiliary_loss_clip": 0.01140322, + "auxiliary_loss_mlp": 0.01137848, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00080538, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.4485063510226521, + "language_loss": 0.71434259, + "learning_rate": 3.326391068322232e-06, + "loss": 0.7371242, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.660576820373535 + }, + { + "auxiliary_loss_clip": 0.01158732, + "auxiliary_loss_mlp": 0.01138135, + "balance_loss_clip": 1.00215626, + "balance_loss_mlp": 1.00071061, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5655299393233268, + "language_loss": 0.73337138, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75634009, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.5581259727478027 + }, + { + "auxiliary_loss_clip": 0.01113543, + "auxiliary_loss_mlp": 0.01138227, + "balance_loss_clip": 1.0022161, + "balance_loss_mlp": 1.00080252, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.305559062119577, + "language_loss": 0.58147281, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60399044, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 4.195500135421753 + }, + { + "auxiliary_loss_clip": 0.01157444, + "auxiliary_loss_mlp": 0.01138023, + "balance_loss_clip": 1.00229549, + "balance_loss_mlp": 1.00059807, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 1.8167799283026829, + "language_loss": 0.86479372, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88774836, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.5871665477752686 + }, + { + "auxiliary_loss_clip": 0.01140783, + "auxiliary_loss_mlp": 0.01139084, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00080156, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7989592027451742, + "language_loss": 0.67355645, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.6963551, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.5825235843658447 + }, + { + "auxiliary_loss_clip": 0.01146639, + "auxiliary_loss_mlp": 0.01137929, + "balance_loss_clip": 1.0024122, + "balance_loss_mlp": 1.00069511, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 2.0100230802059316, + "language_loss": 0.70475119, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72759688, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.583510160446167 + }, + { + "auxiliary_loss_clip": 0.01157355, + "auxiliary_loss_mlp": 0.01138184, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.00066459, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.5449560226395016, + "language_loss": 0.73938537, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76234078, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.5600478649139404 + }, + { + "auxiliary_loss_clip": 0.01158494, + "auxiliary_loss_mlp": 0.01137986, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00065672, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.0017334325872125, + "language_loss": 0.77083206, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.7937969, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 3.958034038543701 + }, + { + "auxiliary_loss_clip": 0.01158898, + "auxiliary_loss_mlp": 0.01138846, + "balance_loss_clip": 1.00205362, + "balance_loss_mlp": 1.00084901, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7962080432695189, + "language_loss": 0.78846431, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.81144172, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.535592794418335 + }, + { + "auxiliary_loss_clip": 0.01140371, + "auxiliary_loss_mlp": 0.01138406, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.00079048, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 2.7564623008996394, + "language_loss": 0.75740206, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78018987, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 4.052216053009033 + }, + { + "auxiliary_loss_clip": 0.01158268, + "auxiliary_loss_mlp": 0.0113799, + "balance_loss_clip": 1.00219154, + "balance_loss_mlp": 1.0007565, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.386140244453486, + "language_loss": 0.77482677, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79778934, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.595374345779419 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.01138473, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00076199, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.550728911211385, + "language_loss": 0.78207105, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80485761, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.601860523223877 + }, + { + "auxiliary_loss_clip": 0.01123712, + "auxiliary_loss_mlp": 0.01138112, + "balance_loss_clip": 1.00196135, + "balance_loss_mlp": 1.00078273, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.9593286112749815, + "language_loss": 0.8855058, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90812409, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.6109068393707275 + }, + { + "auxiliary_loss_clip": 0.01157522, + "auxiliary_loss_mlp": 0.0113781, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00095773, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 2.9757987532847814, + "language_loss": 0.86392826, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88688159, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 3.968441963195801 + }, + { + "auxiliary_loss_clip": 0.01163117, + "auxiliary_loss_mlp": 0.01127074, + "balance_loss_clip": 1.00405407, + "balance_loss_mlp": 1.00042653, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.7976031377217663, + "language_loss": 0.60148782, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62438977, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.241638422012329 + }, + { + "auxiliary_loss_clip": 0.01141854, + "auxiliary_loss_mlp": 0.00748072, + "balance_loss_clip": 1.00195038, + "balance_loss_mlp": 1.00050664, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.9607846085073615, + "language_loss": 0.68707919, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70597845, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.618443489074707 + }, + { + "auxiliary_loss_clip": 0.01158797, + "auxiliary_loss_mlp": 0.00747888, + "balance_loss_clip": 1.00215948, + "balance_loss_mlp": 1.00043988, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.8066929458587377, + "language_loss": 0.84307456, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86214143, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.610055923461914 + }, + { + "auxiliary_loss_clip": 0.01174127, + "auxiliary_loss_mlp": 0.01138595, + "balance_loss_clip": 1.00234175, + "balance_loss_mlp": 1.00088477, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 1.7666389456367428, + "language_loss": 0.77742422, + "learning_rate": 3.321428460652342e-06, + "loss": 0.80055153, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 2.5712132453918457 + }, + { + "auxiliary_loss_clip": 0.0111385, + "auxiliary_loss_mlp": 0.01138313, + "balance_loss_clip": 1.00210345, + "balance_loss_mlp": 1.00069773, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 4.521227428636121, + "language_loss": 0.68556547, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.70808709, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 2.6593101024627686 + }, + { + "auxiliary_loss_clip": 0.01140356, + "auxiliary_loss_mlp": 0.01138572, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00086188, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.253828663723912, + "language_loss": 0.74930465, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77209389, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.681140422821045 + }, + { + "auxiliary_loss_clip": 0.01158727, + "auxiliary_loss_mlp": 0.01138234, + "balance_loss_clip": 1.00229394, + "balance_loss_mlp": 1.00090528, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.6380823373660962, + "language_loss": 0.91732073, + "learning_rate": 3.320551201545832e-06, + "loss": 0.94029027, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.511812925338745 + }, + { + "auxiliary_loss_clip": 0.01158763, + "auxiliary_loss_mlp": 0.01137548, + "balance_loss_clip": 1.0021199, + "balance_loss_mlp": 1.00069594, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.4173522296574346, + "language_loss": 0.73625195, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75921512, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 2.5118160247802734 + }, + { + "auxiliary_loss_clip": 0.01081058, + "auxiliary_loss_mlp": 0.01137448, + "balance_loss_clip": 1.00176823, + "balance_loss_mlp": 1.00078678, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 2.0214402996454273, + "language_loss": 0.77679294, + "learning_rate": 3.319966111745842e-06, + "loss": 0.79897797, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.7300593852996826 + }, + { + "auxiliary_loss_clip": 0.01127913, + "auxiliary_loss_mlp": 0.01139035, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00094247, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.9322664885513663, + "language_loss": 0.81748474, + "learning_rate": 3.319673491760429e-06, + "loss": 0.84015417, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.653099536895752 + }, + { + "auxiliary_loss_clip": 0.01095734, + "auxiliary_loss_mlp": 0.01137928, + "balance_loss_clip": 1.00203562, + "balance_loss_mlp": 1.00069451, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.9624400714002659, + "language_loss": 0.84814072, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87047732, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.6920952796936035 + }, + { + "auxiliary_loss_clip": 0.0114198, + "auxiliary_loss_mlp": 0.01138265, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.00074553, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 2.0532075777307814, + "language_loss": 0.75721955, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.78002203, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 2.690842628479004 + }, + { + "auxiliary_loss_clip": 0.01090469, + "auxiliary_loss_mlp": 0.0113775, + "balance_loss_clip": 1.00151372, + "balance_loss_mlp": 1.00089777, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 2.043160587577593, + "language_loss": 0.73384774, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75612986, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.693002223968506 + }, + { + "auxiliary_loss_clip": 0.01107828, + "auxiliary_loss_mlp": 0.01137736, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00059795, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3324900179425858, + "language_loss": 0.74733061, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76978624, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.6561853885650635 + }, + { + "auxiliary_loss_clip": 0.01142083, + "auxiliary_loss_mlp": 0.0113753, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00067735, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.8001910001067272, + "language_loss": 0.76457727, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78737336, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.6211912631988525 + }, + { + "auxiliary_loss_clip": 0.01158837, + "auxiliary_loss_mlp": 0.01138933, + "balance_loss_clip": 1.00229633, + "balance_loss_mlp": 1.00093627, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.0105244712952373, + "language_loss": 0.67731458, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.70029223, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 2.5485689640045166 + }, + { + "auxiliary_loss_clip": 0.01143211, + "auxiliary_loss_mlp": 0.01138326, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00071108, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.890697773294884, + "language_loss": 0.77116299, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79397839, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.6555838584899902 + }, + { + "auxiliary_loss_clip": 0.01091564, + "auxiliary_loss_mlp": 0.01138937, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00065398, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.047779566150845, + "language_loss": 0.73011494, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75241995, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.7087976932525635 + }, + { + "auxiliary_loss_clip": 0.01157112, + "auxiliary_loss_mlp": 0.0113853, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00091445, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.9276055955779723, + "language_loss": 0.78266108, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80561757, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.551380157470703 + }, + { + "auxiliary_loss_clip": 0.01108946, + "auxiliary_loss_mlp": 0.01139162, + "balance_loss_clip": 1.00203264, + "balance_loss_mlp": 1.00087941, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.04887085123622, + "language_loss": 0.77266145, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79514253, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.6296982765197754 + }, + { + "auxiliary_loss_clip": 0.01158793, + "auxiliary_loss_mlp": 0.01138254, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00073433, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.7198560935272351, + "language_loss": 0.68899649, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71196693, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 2.5223968029022217 + }, + { + "auxiliary_loss_clip": 0.01158726, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.0008626, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 1.919479531551281, + "language_loss": 0.82339549, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84636277, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.5098154544830322 + }, + { + "auxiliary_loss_clip": 0.01158362, + "auxiliary_loss_mlp": 0.01138849, + "balance_loss_clip": 1.00213504, + "balance_loss_mlp": 1.00056672, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 2.277927059950223, + "language_loss": 0.68301022, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70598245, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.5160775184631348 + }, + { + "auxiliary_loss_clip": 0.0113157, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_clip": 1.00268853, + "balance_loss_mlp": 1.00075126, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.7767026229113911, + "language_loss": 0.74110907, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.76381224, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.730954170227051 + }, + { + "auxiliary_loss_clip": 0.01126635, + "auxiliary_loss_mlp": 0.00747901, + "balance_loss_clip": 1.00208247, + "balance_loss_mlp": 1.0005157, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.428759162699035, + "language_loss": 0.66179848, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.6805439, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.693002939224243 + }, + { + "auxiliary_loss_clip": 0.01158861, + "auxiliary_loss_mlp": 0.01138184, + "balance_loss_clip": 1.00221038, + "balance_loss_mlp": 1.00075984, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.2462336947194226, + "language_loss": 0.70640081, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72937131, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.5679917335510254 + }, + { + "auxiliary_loss_clip": 0.01142174, + "auxiliary_loss_mlp": 0.00747928, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.00041425, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 2.2893334618841124, + "language_loss": 0.83672023, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85562122, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 4.07778525352478 + }, + { + "auxiliary_loss_clip": 0.01174076, + "auxiliary_loss_mlp": 0.01138705, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00070834, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.3340094905896396, + "language_loss": 0.71898824, + "learning_rate": 3.314397785576548e-06, + "loss": 0.74211603, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.512157678604126 + }, + { + "auxiliary_loss_clip": 0.01140601, + "auxiliary_loss_mlp": 0.01138273, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.00065732, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 9.215070172554322, + "language_loss": 0.92375767, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94654644, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.5869240760803223 + }, + { + "auxiliary_loss_clip": 0.01158715, + "auxiliary_loss_mlp": 0.01138476, + "balance_loss_clip": 1.0023073, + "balance_loss_mlp": 1.00066972, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.4887888070633837, + "language_loss": 0.72837609, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75134802, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.5782744884490967 + }, + { + "auxiliary_loss_clip": 0.01158886, + "auxiliary_loss_mlp": 0.01138391, + "balance_loss_clip": 1.00219917, + "balance_loss_mlp": 1.00068057, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 5.049246518329062, + "language_loss": 0.85110658, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87407941, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 3.9179489612579346 + }, + { + "auxiliary_loss_clip": 0.01125148, + "auxiliary_loss_mlp": 0.01137798, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00065947, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.3818482341841243, + "language_loss": 0.77132648, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79395592, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.6450870037078857 + }, + { + "auxiliary_loss_clip": 0.01140722, + "auxiliary_loss_mlp": 0.01137814, + "balance_loss_clip": 1.00225616, + "balance_loss_mlp": 1.00086677, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1589719919000188, + "language_loss": 0.80060953, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.82339489, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.6365063190460205 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.01138072, + "balance_loss_clip": 1.00216746, + "balance_loss_mlp": 1.00064766, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.471519341914317, + "language_loss": 0.55481791, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57760876, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 4.189180135726929 + }, + { + "auxiliary_loss_clip": 0.01157385, + "auxiliary_loss_mlp": 0.01138553, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00074697, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.7914352731037886, + "language_loss": 0.84711218, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.87007153, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.5420761108398438 + }, + { + "auxiliary_loss_clip": 0.0115739, + "auxiliary_loss_mlp": 0.01138264, + "balance_loss_clip": 1.00221598, + "balance_loss_mlp": 1.00074399, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.7963095497056447, + "language_loss": 0.73077691, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.75373346, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.5356662273406982 + }, + { + "auxiliary_loss_clip": 0.01173955, + "auxiliary_loss_mlp": 0.01138232, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00090241, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.7182001119088925, + "language_loss": 0.77124935, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79437125, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 3.866459608078003 + }, + { + "auxiliary_loss_clip": 0.01173975, + "auxiliary_loss_mlp": 0.0113795, + "balance_loss_clip": 1.00219965, + "balance_loss_mlp": 1.00071597, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.784164844735794, + "language_loss": 0.78617746, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80929673, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.5501773357391357 + }, + { + "auxiliary_loss_clip": 0.01123884, + "auxiliary_loss_mlp": 0.01138249, + "balance_loss_clip": 1.00207901, + "balance_loss_mlp": 1.00072956, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 2.0045251079459803, + "language_loss": 0.85158205, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87420332, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.6907033920288086 + }, + { + "auxiliary_loss_clip": 0.01157299, + "auxiliary_loss_mlp": 0.0113853, + "balance_loss_clip": 1.00213206, + "balance_loss_mlp": 1.0008198, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.0990353222563836, + "language_loss": 0.90424359, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92720181, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.546675443649292 + }, + { + "auxiliary_loss_clip": 0.01157351, + "auxiliary_loss_mlp": 0.01138371, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00066018, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 2.9832717136903026, + "language_loss": 0.86434078, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88729799, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.5871741771698 + }, + { + "auxiliary_loss_clip": 0.01158162, + "auxiliary_loss_mlp": 0.01138458, + "balance_loss_clip": 1.00214064, + "balance_loss_mlp": 1.00084305, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.6145193142988823, + "language_loss": 0.73504108, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75800729, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 2.5685203075408936 + }, + { + "auxiliary_loss_clip": 0.01158642, + "auxiliary_loss_mlp": 0.01138738, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00074172, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 3.0106778606548494, + "language_loss": 0.74373698, + "learning_rate": 3.309989025093813e-06, + "loss": 0.76671076, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.564488410949707 + }, + { + "auxiliary_loss_clip": 0.01157797, + "auxiliary_loss_mlp": 0.0113916, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.00078201, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.4094049384924383, + "language_loss": 0.70767277, + "learning_rate": 3.309694709912618e-06, + "loss": 0.73064232, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.551104784011841 + }, + { + "auxiliary_loss_clip": 0.01141523, + "auxiliary_loss_mlp": 0.00747999, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00054145, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 2.1020689063990314, + "language_loss": 0.78855503, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.80745018, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.6079061031341553 + }, + { + "auxiliary_loss_clip": 0.01143303, + "auxiliary_loss_mlp": 0.01138082, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00075293, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 2.414904059473817, + "language_loss": 0.80864656, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.83146036, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.5807130336761475 + }, + { + "auxiliary_loss_clip": 0.011403, + "auxiliary_loss_mlp": 0.01137289, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00072253, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.021873953683788, + "language_loss": 0.58029568, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60307157, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.628831386566162 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.01138177, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00094342, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.8253892395921156, + "language_loss": 0.75580919, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77859616, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.644968032836914 + }, + { + "auxiliary_loss_clip": 0.01141825, + "auxiliary_loss_mlp": 0.01138873, + "balance_loss_clip": 1.00202072, + "balance_loss_mlp": 1.00078106, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.7776438923941233, + "language_loss": 0.62418294, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64698994, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.7373714447021484 + }, + { + "auxiliary_loss_clip": 0.01157157, + "auxiliary_loss_mlp": 0.0113841, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.00079525, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.7272736277123864, + "language_loss": 0.73489749, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75785315, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.600337505340576 + }, + { + "auxiliary_loss_clip": 0.01126864, + "auxiliary_loss_mlp": 0.01138138, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00061834, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.64032261094371, + "language_loss": 0.81346273, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83611286, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.6608083248138428 + }, + { + "auxiliary_loss_clip": 0.01116003, + "auxiliary_loss_mlp": 0.01138001, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.00076747, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 3.221736692330743, + "language_loss": 0.87826288, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.90080285, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 2.6745710372924805 + }, + { + "auxiliary_loss_clip": 0.01173968, + "auxiliary_loss_mlp": 0.01138966, + "balance_loss_clip": 1.00215054, + "balance_loss_mlp": 1.0008744, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 3.0460887467159234, + "language_loss": 0.81528175, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83841109, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.4953978061676025 + }, + { + "auxiliary_loss_clip": 0.01172989, + "auxiliary_loss_mlp": 0.01126733, + "balance_loss_clip": 1.0034188, + "balance_loss_mlp": 1.00084829, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7928111897449607, + "language_loss": 0.57285094, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59584814, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 2.9715871810913086 + }, + { + "auxiliary_loss_clip": 0.01157145, + "auxiliary_loss_mlp": 0.0074804, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00065875, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.4408914915038897, + "language_loss": 0.86374903, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88280082, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.565713882446289 + }, + { + "auxiliary_loss_clip": 0.01156974, + "auxiliary_loss_mlp": 0.01137634, + "balance_loss_clip": 1.00196302, + "balance_loss_mlp": 1.00078154, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.9240751856588216, + "language_loss": 0.72829223, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75123823, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.54287052154541 + }, + { + "auxiliary_loss_clip": 0.01157223, + "auxiliary_loss_mlp": 0.01137927, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00078821, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.5763749708886063, + "language_loss": 0.89605689, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.91900837, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.5313549041748047 + }, + { + "auxiliary_loss_clip": 0.01142831, + "auxiliary_loss_mlp": 0.0113814, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00100172, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 6.484565309888852, + "language_loss": 0.83040464, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85321438, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.621293783187866 + }, + { + "auxiliary_loss_clip": 0.01173771, + "auxiliary_loss_mlp": 0.01138143, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00090897, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.8939039895096808, + "language_loss": 0.76851106, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79163015, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.5095152854919434 + }, + { + "auxiliary_loss_clip": 0.01141159, + "auxiliary_loss_mlp": 0.01137592, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00073934, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.6935600717779327, + "language_loss": 0.81637239, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83915991, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.7662010192871094 + }, + { + "auxiliary_loss_clip": 0.01076564, + "auxiliary_loss_mlp": 0.01139346, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00077748, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 2.6774566742718378, + "language_loss": 0.85136646, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.87352562, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.7604222297668457 + }, + { + "auxiliary_loss_clip": 0.0115855, + "auxiliary_loss_mlp": 0.01137682, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00073409, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 1.9876519734470395, + "language_loss": 0.69818878, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.72115111, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.5648460388183594 + }, + { + "auxiliary_loss_clip": 0.01141736, + "auxiliary_loss_mlp": 0.01137957, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00062776, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.054785525279124, + "language_loss": 0.90786767, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.9306646, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.597792148590088 + }, + { + "auxiliary_loss_clip": 0.01173789, + "auxiliary_loss_mlp": 0.0113803, + "balance_loss_clip": 1.00212538, + "balance_loss_mlp": 1.00070095, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 2.0983358715066815, + "language_loss": 0.72893012, + "learning_rate": 3.303797991757425e-06, + "loss": 0.75204825, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.599569320678711 + }, + { + "auxiliary_loss_clip": 0.01141678, + "auxiliary_loss_mlp": 0.01137943, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00080502, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.9463066648136746, + "language_loss": 0.76396954, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.78676575, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 2.5728540420532227 + }, + { + "auxiliary_loss_clip": 0.01140984, + "auxiliary_loss_mlp": 0.01139176, + "balance_loss_clip": 1.00221729, + "balance_loss_mlp": 1.00098825, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.803253653294243, + "language_loss": 0.68782878, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71063036, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 4.111106634140015 + }, + { + "auxiliary_loss_clip": 0.01140416, + "auxiliary_loss_mlp": 0.0113842, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00089979, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 2.2204686156851623, + "language_loss": 0.74606717, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76885557, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.5539908409118652 + }, + { + "auxiliary_loss_clip": 0.01173967, + "auxiliary_loss_mlp": 0.00748094, + "balance_loss_clip": 1.00213146, + "balance_loss_mlp": 1.00065136, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.835023191272414, + "language_loss": 0.76656598, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78578657, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.567035436630249 + }, + { + "auxiliary_loss_clip": 0.01140977, + "auxiliary_loss_mlp": 0.01138493, + "balance_loss_clip": 1.00184429, + "balance_loss_mlp": 1.00097358, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.7966667469833308, + "language_loss": 0.86135936, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88415402, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 4.125729560852051 + }, + { + "auxiliary_loss_clip": 0.01158073, + "auxiliary_loss_mlp": 0.01137305, + "balance_loss_clip": 1.00204766, + "balance_loss_mlp": 1.0007391, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.6918452095864172, + "language_loss": 0.82052475, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84347856, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.5745890140533447 + }, + { + "auxiliary_loss_clip": 0.01111073, + "auxiliary_loss_mlp": 0.01137864, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.000916, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.3088717426658723, + "language_loss": 0.85856068, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88104999, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.6410226821899414 + }, + { + "auxiliary_loss_clip": 0.01125199, + "auxiliary_loss_mlp": 0.01137525, + "balance_loss_clip": 1.00197542, + "balance_loss_mlp": 1.00067317, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.282460291667732, + "language_loss": 0.85666567, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.87929291, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 4.012114524841309 + }, + { + "auxiliary_loss_clip": 0.01157767, + "auxiliary_loss_mlp": 0.01137382, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00091136, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.9032784154290225, + "language_loss": 0.80579805, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.82874954, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.5136489868164062 + }, + { + "auxiliary_loss_clip": 0.01141499, + "auxiliary_loss_mlp": 0.01138595, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00088453, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 2.7838164505588234, + "language_loss": 0.72673059, + "learning_rate": 3.300842211064773e-06, + "loss": 0.74953151, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.623746395111084 + }, + { + "auxiliary_loss_clip": 0.01143081, + "auxiliary_loss_mlp": 0.01138427, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00109839, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.2900666011272146, + "language_loss": 0.72090399, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.7437191, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 3.9387974739074707 + }, + { + "auxiliary_loss_clip": 0.01124274, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_clip": 1.00232553, + "balance_loss_mlp": 1.00037372, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8214821029925002, + "language_loss": 0.60704166, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62953937, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.1486423015594482 + }, + { + "auxiliary_loss_clip": 0.01092313, + "auxiliary_loss_mlp": 0.01126241, + "balance_loss_clip": 1.00244284, + "balance_loss_mlp": 1.0003556, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7445482790956148, + "language_loss": 0.52386892, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54605448, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.266634464263916 + }, + { + "auxiliary_loss_clip": 0.01158406, + "auxiliary_loss_mlp": 0.01138094, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00076437, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 1.853826266767287, + "language_loss": 0.81665689, + "learning_rate": 3.299658516973972e-06, + "loss": 0.8396219, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.7780239582061768 + }, + { + "auxiliary_loss_clip": 0.0112665, + "auxiliary_loss_mlp": 0.01137488, + "balance_loss_clip": 1.00195038, + "balance_loss_mlp": 1.00082684, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.6995169078553913, + "language_loss": 0.75001431, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77265573, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.672347068786621 + }, + { + "auxiliary_loss_clip": 0.01142256, + "auxiliary_loss_mlp": 0.01138338, + "balance_loss_clip": 1.0019058, + "balance_loss_mlp": 1.00100923, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 2.8916074922591553, + "language_loss": 0.62168062, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64448655, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.567826271057129 + }, + { + "auxiliary_loss_clip": 0.01158807, + "auxiliary_loss_mlp": 0.01138188, + "balance_loss_clip": 1.00224757, + "balance_loss_mlp": 1.00076377, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4225089700192155, + "language_loss": 0.79480654, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81777656, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 2.6046981811523438 + }, + { + "auxiliary_loss_clip": 0.011069, + "auxiliary_loss_mlp": 0.01138468, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00075793, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.873586301340774, + "language_loss": 0.73976386, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76221752, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.7882323265075684 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01137858, + "balance_loss_clip": 1.00167108, + "balance_loss_mlp": 1.00071931, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.7546196057571135, + "language_loss": 0.78319454, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80564547, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.6772055625915527 + }, + { + "auxiliary_loss_clip": 0.01141455, + "auxiliary_loss_mlp": 0.01138914, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00082207, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 1.8315328730675517, + "language_loss": 0.76928347, + "learning_rate": 3.297881497566964e-06, + "loss": 0.7920872, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.537503957748413 + }, + { + "auxiliary_loss_clip": 0.01128135, + "auxiliary_loss_mlp": 0.01138084, + "balance_loss_clip": 1.00204933, + "balance_loss_mlp": 1.00075531, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 2.5433863781521007, + "language_loss": 0.78329831, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80596054, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.6636228561401367 + }, + { + "auxiliary_loss_clip": 0.0114193, + "auxiliary_loss_mlp": 0.01138884, + "balance_loss_clip": 1.00215602, + "balance_loss_mlp": 1.00079179, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.564897494460864, + "language_loss": 0.75208485, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77489293, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.5954368114471436 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_clip": 1.00212741, + "balance_loss_mlp": 1.00083447, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.447281703192842, + "language_loss": 0.7417835, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.76474577, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.7361881732940674 + }, + { + "auxiliary_loss_clip": 0.01125092, + "auxiliary_loss_mlp": 0.01139059, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.0009675, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 2.530131243266589, + "language_loss": 0.70481384, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72745532, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.7031631469726562 + }, + { + "auxiliary_loss_clip": 0.01141425, + "auxiliary_loss_mlp": 0.0113841, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.0006994, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 2.45912928214754, + "language_loss": 0.80056596, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82336426, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 2.593891143798828 + }, + { + "auxiliary_loss_clip": 0.01141697, + "auxiliary_loss_mlp": 0.01137445, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00078309, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.399733354617972, + "language_loss": 0.82819927, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85099065, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.621062994003296 + }, + { + "auxiliary_loss_clip": 0.01107801, + "auxiliary_loss_mlp": 0.01137927, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.0006938, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 1.7548194714929848, + "language_loss": 0.66809213, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69054943, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.6566035747528076 + }, + { + "auxiliary_loss_clip": 0.01158489, + "auxiliary_loss_mlp": 0.00748387, + "balance_loss_clip": 1.00223982, + "balance_loss_mlp": 1.00083828, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.7062329422833287, + "language_loss": 0.73742992, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75649869, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.623918294906616 + }, + { + "auxiliary_loss_clip": 0.01123176, + "auxiliary_loss_mlp": 0.01138493, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00078201, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.394403303245122, + "language_loss": 0.73130834, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75392497, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.690279722213745 + }, + { + "auxiliary_loss_clip": 0.01173785, + "auxiliary_loss_mlp": 0.01137668, + "balance_loss_clip": 1.00213194, + "balance_loss_mlp": 1.00062537, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.5711631482727815, + "language_loss": 0.8396672, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.8627817, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.5677595138549805 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01137925, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00069118, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 3.4558383342710544, + "language_loss": 0.71287012, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73581886, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.5918471813201904 + }, + { + "auxiliary_loss_clip": 0.01107418, + "auxiliary_loss_mlp": 0.01136989, + "balance_loss_clip": 1.00167251, + "balance_loss_mlp": 1.0007093, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 13.862570679717845, + "language_loss": 0.82611799, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84856212, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.6611804962158203 + }, + { + "auxiliary_loss_clip": 0.01141363, + "auxiliary_loss_mlp": 0.01138003, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00067401, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 3.3771013968241026, + "language_loss": 0.74321538, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76600909, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.603695869445801 + }, + { + "auxiliary_loss_clip": 0.01077557, + "auxiliary_loss_mlp": 0.01137804, + "balance_loss_clip": 1.00166833, + "balance_loss_mlp": 1.00076127, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.72805699140673, + "language_loss": 0.8356787, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85783231, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.7409088611602783 + }, + { + "auxiliary_loss_clip": 0.01143089, + "auxiliary_loss_mlp": 0.01138182, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00066209, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.1280380235528606, + "language_loss": 0.74305463, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76586729, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.56219744682312 + }, + { + "auxiliary_loss_clip": 0.01173639, + "auxiliary_loss_mlp": 0.01137559, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00061131, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.8288814943899292, + "language_loss": 0.76082039, + "learning_rate": 3.293134123765452e-06, + "loss": 0.78393239, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.4833738803863525 + }, + { + "auxiliary_loss_clip": 0.01108682, + "auxiliary_loss_mlp": 0.01138657, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.00066006, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 1.797296850406773, + "language_loss": 0.72570026, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74817365, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 2.639211654663086 + }, + { + "auxiliary_loss_clip": 0.01157051, + "auxiliary_loss_mlp": 0.01139059, + "balance_loss_clip": 1.00201428, + "balance_loss_mlp": 1.00058556, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.8118816239872826, + "language_loss": 0.79069537, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81365645, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 3.97463059425354 + }, + { + "auxiliary_loss_clip": 0.01158533, + "auxiliary_loss_mlp": 0.01137972, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00064278, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.528879345725746, + "language_loss": 0.70473075, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72769582, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.542963743209839 + }, + { + "auxiliary_loss_clip": 0.01124895, + "auxiliary_loss_mlp": 0.01138506, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00079513, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.5148884564234992, + "language_loss": 0.7909162, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81355023, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.619570255279541 + }, + { + "auxiliary_loss_clip": 0.01158372, + "auxiliary_loss_mlp": 0.01138566, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00066459, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 2.4129884222711393, + "language_loss": 0.79398894, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81695831, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.601654052734375 + }, + { + "auxiliary_loss_clip": 0.01129202, + "auxiliary_loss_mlp": 0.01138593, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00078654, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.7107872830994126, + "language_loss": 0.73763639, + "learning_rate": 3.291350619752129e-06, + "loss": 0.76031435, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.5789077281951904 + }, + { + "auxiliary_loss_clip": 0.01158461, + "auxiliary_loss_mlp": 0.01138461, + "balance_loss_clip": 1.00206649, + "balance_loss_mlp": 1.00065541, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 3.4166962773937137, + "language_loss": 0.61829317, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64126229, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 4.023039102554321 + }, + { + "auxiliary_loss_clip": 0.01157122, + "auxiliary_loss_mlp": 0.01138906, + "balance_loss_clip": 1.00215054, + "balance_loss_mlp": 1.00090909, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.8377439976800474, + "language_loss": 0.83361059, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85657084, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.5446925163269043 + }, + { + "auxiliary_loss_clip": 0.01124029, + "auxiliary_loss_mlp": 0.01138174, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00065446, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.1886117885057033, + "language_loss": 0.67237628, + "learning_rate": 3.290458206523322e-06, + "loss": 0.69499826, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.5865519046783447 + }, + { + "auxiliary_loss_clip": 0.01156834, + "auxiliary_loss_mlp": 0.01137495, + "balance_loss_clip": 1.00196731, + "balance_loss_mlp": 1.00054693, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.8293002261436724, + "language_loss": 0.71354723, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73649049, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 3.917545795440674 + }, + { + "auxiliary_loss_clip": 0.01173838, + "auxiliary_loss_mlp": 0.01138882, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.00098109, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 1.8145002684543838, + "language_loss": 0.66095978, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68408698, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.5033464431762695 + }, + { + "auxiliary_loss_clip": 0.01173905, + "auxiliary_loss_mlp": 0.0113868, + "balance_loss_clip": 1.00226259, + "balance_loss_mlp": 1.00077891, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.2348758796385844, + "language_loss": 0.74176824, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76489413, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 3.8945248126983643 + }, + { + "auxiliary_loss_clip": 0.01143131, + "auxiliary_loss_mlp": 0.01138432, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00062561, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9144845924055274, + "language_loss": 0.71633458, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73915023, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.552536964416504 + }, + { + "auxiliary_loss_clip": 0.01158664, + "auxiliary_loss_mlp": 0.01138529, + "balance_loss_clip": 1.00204337, + "balance_loss_mlp": 1.00053263, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 1.8212759742514668, + "language_loss": 0.76442254, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.7873944, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.6099226474761963 + }, + { + "auxiliary_loss_clip": 0.0117372, + "auxiliary_loss_mlp": 0.0113777, + "balance_loss_clip": 1.00213337, + "balance_loss_mlp": 1.00072742, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.6500398181011708, + "language_loss": 0.69872206, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72183698, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.530681610107422 + }, + { + "auxiliary_loss_clip": 0.0115724, + "auxiliary_loss_mlp": 0.01139395, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00063574, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.2016105628069256, + "language_loss": 0.85405838, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87702477, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.52317214012146 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01138286, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00086117, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 2.324380679673995, + "language_loss": 0.79444349, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81724155, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.579149007797241 + }, + { + "auxiliary_loss_clip": 0.01173728, + "auxiliary_loss_mlp": 0.01138096, + "balance_loss_clip": 1.00215364, + "balance_loss_mlp": 1.00095737, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 2.2836064382749885, + "language_loss": 0.85607183, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87919009, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.4631779193878174 + }, + { + "auxiliary_loss_clip": 0.01141698, + "auxiliary_loss_mlp": 0.01138076, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00065184, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.741457808253383, + "language_loss": 0.77702808, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79982579, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.5393896102905273 + }, + { + "auxiliary_loss_clip": 0.01141695, + "auxiliary_loss_mlp": 0.00748203, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00077391, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.7419472489422765, + "language_loss": 0.72259486, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74149382, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 2.6479685306549072 + }, + { + "auxiliary_loss_clip": 0.01157878, + "auxiliary_loss_mlp": 0.01137421, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.00066376, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.444280679052843, + "language_loss": 0.75874925, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78170228, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.629709005355835 + }, + { + "auxiliary_loss_clip": 0.01156973, + "auxiliary_loss_mlp": 0.01137884, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00084114, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.4652950083826157, + "language_loss": 0.86420918, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.8871578, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.507016897201538 + }, + { + "auxiliary_loss_clip": 0.01140065, + "auxiliary_loss_mlp": 0.01137859, + "balance_loss_clip": 1.00203621, + "balance_loss_mlp": 1.00081587, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.8672423181942388, + "language_loss": 0.68505967, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70783889, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.5971195697784424 + }, + { + "auxiliary_loss_clip": 0.01141893, + "auxiliary_loss_mlp": 0.01138766, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.00057912, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 2.0348923223579, + "language_loss": 0.75675958, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.77956617, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.5794684886932373 + }, + { + "auxiliary_loss_clip": 0.0111418, + "auxiliary_loss_mlp": 0.01138721, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00072396, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.74846697606839, + "language_loss": 0.68342716, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70595622, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.7419612407684326 + }, + { + "auxiliary_loss_clip": 0.01156809, + "auxiliary_loss_mlp": 0.00748172, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00065517, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.885708641989563, + "language_loss": 0.73678917, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75583899, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.577897787094116 + }, + { + "auxiliary_loss_clip": 0.01156838, + "auxiliary_loss_mlp": 0.01138771, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00086951, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.713336564875239, + "language_loss": 0.86358166, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88653773, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 2.552802801132202 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01138914, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00063157, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.120111724994346, + "language_loss": 0.86809707, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.89094973, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.5358991622924805 + }, + { + "auxiliary_loss_clip": 0.01157296, + "auxiliary_loss_mlp": 0.01137859, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00100708, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.7513352754827698, + "language_loss": 0.78805447, + "learning_rate": 3.284497544825668e-06, + "loss": 0.81100595, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.5773935317993164 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01138503, + "balance_loss_clip": 1.00224757, + "balance_loss_mlp": 1.00069666, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.8023777347320724, + "language_loss": 0.78600377, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80880976, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.6174354553222656 + }, + { + "auxiliary_loss_clip": 0.0110935, + "auxiliary_loss_mlp": 0.01139234, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00076056, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 2.0906004273722774, + "language_loss": 0.71663231, + "learning_rate": 3.283900405580837e-06, + "loss": 0.7391181, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 2.9551262855529785 + }, + { + "auxiliary_loss_clip": 0.01140142, + "auxiliary_loss_mlp": 0.01138793, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00089145, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 2.1477887750882427, + "language_loss": 0.73156124, + "learning_rate": 3.283601762924312e-06, + "loss": 0.7543506, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.5868825912475586 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_clip": 1.0021615, + "balance_loss_mlp": 1.000844, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 1.641887608243508, + "language_loss": 0.80192029, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82472205, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.5889992713928223 + }, + { + "auxiliary_loss_clip": 0.01147412, + "auxiliary_loss_mlp": 0.00748214, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00054312, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.6403632963550316, + "language_loss": 0.70324022, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72219646, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.621776819229126 + }, + { + "auxiliary_loss_clip": 0.01125416, + "auxiliary_loss_mlp": 0.01138453, + "balance_loss_clip": 1.00198209, + "balance_loss_mlp": 1.0010283, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 1.9967313043320882, + "language_loss": 0.85795677, + "learning_rate": 3.282705542954199e-06, + "loss": 0.88059545, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.622821092605591 + }, + { + "auxiliary_loss_clip": 0.01156618, + "auxiliary_loss_mlp": 0.01138134, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00061393, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.7156896689200083, + "language_loss": 0.67003894, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69298643, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.6459708213806152 + }, + { + "auxiliary_loss_clip": 0.01141885, + "auxiliary_loss_mlp": 0.01138788, + "balance_loss_clip": 1.00191128, + "balance_loss_mlp": 1.0006001, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 2.08608273350338, + "language_loss": 0.79107094, + "learning_rate": 3.28210781975363e-06, + "loss": 0.8138777, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.6136906147003174 + }, + { + "auxiliary_loss_clip": 0.01173755, + "auxiliary_loss_mlp": 0.01137963, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.00063372, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 3.087382358391289, + "language_loss": 0.8249023, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84801948, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.5458295345306396 + }, + { + "auxiliary_loss_clip": 0.01108135, + "auxiliary_loss_mlp": 0.01139371, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00108778, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 6.10275164597607, + "language_loss": 0.86082351, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88329858, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 2.6912710666656494 + }, + { + "auxiliary_loss_clip": 0.01131042, + "auxiliary_loss_mlp": 0.01138324, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00070882, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5763436480493003, + "language_loss": 0.81088877, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83358246, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 4.068438768386841 + }, + { + "auxiliary_loss_clip": 0.01146153, + "auxiliary_loss_mlp": 0.01137792, + "balance_loss_clip": 1.00209486, + "balance_loss_mlp": 1.00065374, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.579871233762397, + "language_loss": 0.67047846, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69331789, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.774428129196167 + }, + { + "auxiliary_loss_clip": 0.01140248, + "auxiliary_loss_mlp": 0.01137681, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00073361, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 2.2517721912786177, + "language_loss": 0.75685143, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77963078, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.5813968181610107 + }, + { + "auxiliary_loss_clip": 0.01157448, + "auxiliary_loss_mlp": 0.01137877, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.000929, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 2.2290556615213455, + "language_loss": 0.78188717, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.80484045, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 2.545435667037964 + }, + { + "auxiliary_loss_clip": 0.01173657, + "auxiliary_loss_mlp": 0.01137867, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00072837, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5479996158988611, + "language_loss": 0.73427594, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75739121, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.531052827835083 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01138022, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.0008837, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.4985388797072985, + "language_loss": 0.75680059, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77976096, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 3.917983293533325 + }, + { + "auxiliary_loss_clip": 0.01173441, + "auxiliary_loss_mlp": 0.01137449, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00088251, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.774303140452093, + "language_loss": 0.81572211, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83883107, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.459162950515747 + }, + { + "auxiliary_loss_clip": 0.01157075, + "auxiliary_loss_mlp": 0.01138302, + "balance_loss_clip": 1.00205994, + "balance_loss_mlp": 1.00078177, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.9740378284046853, + "language_loss": 0.80482125, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82777506, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.552359104156494 + }, + { + "auxiliary_loss_clip": 0.01107708, + "auxiliary_loss_mlp": 0.01138623, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00072122, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.7300218479165919, + "language_loss": 0.70767796, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73014134, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 4.043479681015015 + }, + { + "auxiliary_loss_clip": 0.01124469, + "auxiliary_loss_mlp": 0.01138123, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00088918, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.777098746456598, + "language_loss": 0.70714122, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72976714, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.685002088546753 + }, + { + "auxiliary_loss_clip": 0.01140232, + "auxiliary_loss_mlp": 0.01138438, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00091791, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 3.691292231383687, + "language_loss": 0.81785506, + "learning_rate": 3.278217882782715e-06, + "loss": 0.8406418, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 3.916046619415283 + }, + { + "auxiliary_loss_clip": 0.01156859, + "auxiliary_loss_mlp": 0.01137706, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.0008533, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.6128428785274656, + "language_loss": 0.74673587, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.76968151, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.553830146789551 + }, + { + "auxiliary_loss_clip": 0.01124692, + "auxiliary_loss_mlp": 0.00748118, + "balance_loss_clip": 1.00160992, + "balance_loss_mlp": 1.00065827, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 1.9252863967380693, + "language_loss": 0.71161169, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.73033977, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.6557488441467285 + }, + { + "auxiliary_loss_clip": 0.01156907, + "auxiliary_loss_mlp": 0.01137579, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00063109, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 3.007750426181323, + "language_loss": 0.76167607, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78462094, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.5396547317504883 + }, + { + "auxiliary_loss_clip": 0.01156906, + "auxiliary_loss_mlp": 0.01137896, + "balance_loss_clip": 1.00205266, + "balance_loss_mlp": 1.0008533, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.7926116098878293, + "language_loss": 0.84714496, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87009305, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.566305637359619 + }, + { + "auxiliary_loss_clip": 0.01158449, + "auxiliary_loss_mlp": 0.0113824, + "balance_loss_clip": 1.00205052, + "balance_loss_mlp": 1.00062442, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.991519379826449, + "language_loss": 0.84044921, + "learning_rate": 3.276719570659604e-06, + "loss": 0.86341608, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.512683868408203 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01137482, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.0006299, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 5.828744472531583, + "language_loss": 0.85369903, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87632167, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.6483066082000732 + }, + { + "auxiliary_loss_clip": 0.01142539, + "auxiliary_loss_mlp": 0.01138374, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.0009501, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 1.886957167380091, + "language_loss": 0.72428691, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74709606, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.5727992057800293 + }, + { + "auxiliary_loss_clip": 0.01156764, + "auxiliary_loss_mlp": 0.01137231, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00076032, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.054096365629408, + "language_loss": 0.87456083, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89750075, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.522925615310669 + }, + { + "auxiliary_loss_clip": 0.01142891, + "auxiliary_loss_mlp": 0.01138106, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.00058627, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 1.9248289806650971, + "language_loss": 0.8258779, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.84868789, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.5547266006469727 + }, + { + "auxiliary_loss_clip": 0.01124435, + "auxiliary_loss_mlp": 0.0113775, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00070739, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6101793011450731, + "language_loss": 0.68587768, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70849955, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.6603095531463623 + }, + { + "auxiliary_loss_clip": 0.01141483, + "auxiliary_loss_mlp": 0.01138016, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00078273, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.17337689218573, + "language_loss": 0.74664533, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76944029, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.6198761463165283 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01137766, + "balance_loss_clip": 1.00203812, + "balance_loss_mlp": 1.00062728, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.4681165780471728, + "language_loss": 0.65179801, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67475522, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.7362828254699707 + }, + { + "auxiliary_loss_clip": 0.0113248, + "auxiliary_loss_mlp": 0.0113787, + "balance_loss_clip": 1.00227046, + "balance_loss_mlp": 1.0009222, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.318986266807277, + "language_loss": 0.68233633, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.7050398, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.631361484527588 + }, + { + "auxiliary_loss_clip": 0.01173387, + "auxiliary_loss_mlp": 0.01136538, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00073528, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9228418583538618, + "language_loss": 0.79238874, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81548798, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.496791362762451 + }, + { + "auxiliary_loss_clip": 0.01140265, + "auxiliary_loss_mlp": 0.01137234, + "balance_loss_clip": 1.00175798, + "balance_loss_mlp": 1.00076294, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.227799960137137, + "language_loss": 0.69668585, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71946084, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.581300973892212 + }, + { + "auxiliary_loss_clip": 0.01173708, + "auxiliary_loss_mlp": 0.01138144, + "balance_loss_clip": 1.00204837, + "balance_loss_mlp": 1.00081468, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9070860630090076, + "language_loss": 0.78491682, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80803537, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 2.488649845123291 + }, + { + "auxiliary_loss_clip": 0.0115797, + "auxiliary_loss_mlp": 0.01137584, + "balance_loss_clip": 1.0020318, + "balance_loss_mlp": 1.0008266, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.1135435246501806, + "language_loss": 0.76211226, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78506774, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 2.520629644393921 + }, + { + "auxiliary_loss_clip": 0.01173526, + "auxiliary_loss_mlp": 0.01137544, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00097835, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.8267504561908645, + "language_loss": 0.70004857, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72315931, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.454906463623047 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.0113769, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00064647, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 1.9861600489437967, + "language_loss": 0.71587133, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73866302, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.576768636703491 + }, + { + "auxiliary_loss_clip": 0.01156834, + "auxiliary_loss_mlp": 0.01137209, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00064254, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 2.0406108345057254, + "language_loss": 0.73988426, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76282465, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.5704452991485596 + }, + { + "auxiliary_loss_clip": 0.0115764, + "auxiliary_loss_mlp": 0.01136872, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00087762, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.4483859969505641, + "language_loss": 0.67073482, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69367993, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.552400827407837 + }, + { + "auxiliary_loss_clip": 0.01158272, + "auxiliary_loss_mlp": 0.01136759, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00085998, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 2.497492429095927, + "language_loss": 0.85220587, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87515616, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.536226272583008 + }, + { + "auxiliary_loss_clip": 0.01140843, + "auxiliary_loss_mlp": 0.01137295, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00082421, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.7508754122389454, + "language_loss": 0.78536981, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80815119, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.6407783031463623 + }, + { + "auxiliary_loss_clip": 0.01141461, + "auxiliary_loss_mlp": 0.01137451, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00078988, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.8471980196815467, + "language_loss": 0.77130878, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79409796, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.6781201362609863 + }, + { + "auxiliary_loss_clip": 0.01126149, + "auxiliary_loss_mlp": 0.01137761, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00071764, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.9393027833104193, + "language_loss": 0.82266653, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84530568, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.616539716720581 + }, + { + "auxiliary_loss_clip": 0.01109659, + "auxiliary_loss_mlp": 0.00748058, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00059938, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 2.275844106662597, + "language_loss": 0.69948685, + "learning_rate": 3.270413459468905e-06, + "loss": 0.71806401, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.685471534729004 + }, + { + "auxiliary_loss_clip": 0.01158296, + "auxiliary_loss_mlp": 0.01137692, + "balance_loss_clip": 1.0020715, + "balance_loss_mlp": 1.00064909, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.7557125248915764, + "language_loss": 0.82569909, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84865892, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.6056435108184814 + }, + { + "auxiliary_loss_clip": 0.0112531, + "auxiliary_loss_mlp": 0.01138434, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00081861, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.009673568497292, + "language_loss": 0.7376129, + "learning_rate": 3.269811767783906e-06, + "loss": 0.76025033, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 4.080660581588745 + }, + { + "auxiliary_loss_clip": 0.01156708, + "auxiliary_loss_mlp": 0.01136872, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00087786, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.6650272880529289, + "language_loss": 0.74216616, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76510197, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.572700262069702 + }, + { + "auxiliary_loss_clip": 0.01173587, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.00060749, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 2.44923589777149, + "language_loss": 0.72275996, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74587035, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.5531086921691895 + }, + { + "auxiliary_loss_clip": 0.0115825, + "auxiliary_loss_mlp": 0.01136541, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00064206, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 2.1174006194176385, + "language_loss": 0.87740552, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.90035343, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.577580451965332 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01136993, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00080824, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.3395951091771883, + "language_loss": 0.77688074, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79949844, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 4.066823959350586 + }, + { + "auxiliary_loss_clip": 0.01130405, + "auxiliary_loss_mlp": 0.01138089, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.0007596, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 3.048598710145636, + "language_loss": 0.77556199, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79824692, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 2.6004161834716797 + }, + { + "auxiliary_loss_clip": 0.01140995, + "auxiliary_loss_mlp": 0.01137141, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00076556, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 2.9406932233102925, + "language_loss": 0.74120021, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76398164, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 2.80419659614563 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.0074814, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00081122, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 2.1610858993999886, + "language_loss": 0.7998879, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81910324, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.574737548828125 + }, + { + "auxiliary_loss_clip": 0.01141725, + "auxiliary_loss_mlp": 0.01137213, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00083804, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 2.1113299192597674, + "language_loss": 0.82010734, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8428967, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 3.9716708660125732 + }, + { + "auxiliary_loss_clip": 0.01108297, + "auxiliary_loss_mlp": 0.01123731, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00013435, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7597325639641322, + "language_loss": 0.5946362, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61695647, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 4.686215400695801 + }, + { + "auxiliary_loss_clip": 0.0117366, + "auxiliary_loss_mlp": 0.01137982, + "balance_loss_clip": 1.00209212, + "balance_loss_mlp": 1.00084352, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.7090690461945555, + "language_loss": 0.71606272, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73917913, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.493161916732788 + }, + { + "auxiliary_loss_clip": 0.01107597, + "auxiliary_loss_mlp": 0.01136309, + "balance_loss_clip": 1.00165308, + "balance_loss_mlp": 1.00050545, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.0594803359980456, + "language_loss": 0.6975978, + "learning_rate": 3.266499023140606e-06, + "loss": 0.7200368, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.663015127182007 + }, + { + "auxiliary_loss_clip": 0.01156659, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00049806, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3887492565064035, + "language_loss": 0.77311629, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79605258, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.568404197692871 + }, + { + "auxiliary_loss_clip": 0.0117352, + "auxiliary_loss_mlp": 0.00748144, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00081158, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.6419632128900636, + "language_loss": 0.72710192, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74631858, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.5477054119110107 + }, + { + "auxiliary_loss_clip": 0.01162779, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00059581, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.8218978346383035, + "language_loss": 0.81074971, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83375013, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.554663896560669 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01137769, + "balance_loss_clip": 1.00163293, + "balance_loss_mlp": 1.00082183, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 2.2951265337240656, + "language_loss": 0.7210598, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74353462, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.6666910648345947 + }, + { + "auxiliary_loss_clip": 0.01139945, + "auxiliary_loss_mlp": 0.0113657, + "balance_loss_clip": 1.00180328, + "balance_loss_mlp": 1.00067186, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.6411000933031967, + "language_loss": 0.75051564, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77328074, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 2.571925640106201 + }, + { + "auxiliary_loss_clip": 0.01157862, + "auxiliary_loss_mlp": 0.01137048, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00076795, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 3.143002880861267, + "language_loss": 0.81986821, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84281731, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 2.590867280960083 + }, + { + "auxiliary_loss_clip": 0.01125121, + "auxiliary_loss_mlp": 0.01137318, + "balance_loss_clip": 1.00171542, + "balance_loss_mlp": 1.00075221, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 6.012918744763482, + "language_loss": 0.74348164, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.76610601, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.5987730026245117 + }, + { + "auxiliary_loss_clip": 0.01093062, + "auxiliary_loss_mlp": 0.0074818, + "balance_loss_clip": 1.00179279, + "balance_loss_mlp": 1.00082207, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.8647268513627824, + "language_loss": 0.76870131, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78711373, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.7359633445739746 + }, + { + "auxiliary_loss_clip": 0.01173628, + "auxiliary_loss_mlp": 0.01137574, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.0010078, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.9417694678219912, + "language_loss": 0.83064348, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.85375547, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.5014190673828125 + }, + { + "auxiliary_loss_clip": 0.01139591, + "auxiliary_loss_mlp": 0.011372, + "balance_loss_clip": 1.00170934, + "balance_loss_mlp": 1.00072908, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.4837357982550692, + "language_loss": 0.70849079, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73125869, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.5305044651031494 + }, + { + "auxiliary_loss_clip": 0.01173493, + "auxiliary_loss_mlp": 0.01136885, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00079608, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.9973663854845656, + "language_loss": 0.69652772, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71963149, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.5427544116973877 + }, + { + "auxiliary_loss_clip": 0.0114073, + "auxiliary_loss_mlp": 0.01137702, + "balance_loss_clip": 1.00227714, + "balance_loss_mlp": 1.00075436, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 1.918517478062231, + "language_loss": 0.68041933, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.70320362, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.592911720275879 + }, + { + "auxiliary_loss_clip": 0.01141382, + "auxiliary_loss_mlp": 0.01137211, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00083554, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.555736855770236, + "language_loss": 0.82730091, + "learning_rate": 3.262576470461507e-06, + "loss": 0.85008681, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.5760200023651123 + }, + { + "auxiliary_loss_clip": 0.01140525, + "auxiliary_loss_mlp": 0.01136958, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00077331, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 2.083968395386181, + "language_loss": 0.8884837, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91125852, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.5768349170684814 + }, + { + "auxiliary_loss_clip": 0.01125017, + "auxiliary_loss_mlp": 0.01137394, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00082779, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 3.224878906480607, + "language_loss": 0.71369278, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73631692, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.645071506500244 + }, + { + "auxiliary_loss_clip": 0.01095313, + "auxiliary_loss_mlp": 0.01137444, + "balance_loss_clip": 1.00187433, + "balance_loss_mlp": 1.00097346, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.8261368803574505, + "language_loss": 0.72505659, + "learning_rate": 3.26167011603268e-06, + "loss": 0.74738419, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 2.7592248916625977 + }, + { + "auxiliary_loss_clip": 0.01173444, + "auxiliary_loss_mlp": 0.01137073, + "balance_loss_clip": 1.00199831, + "balance_loss_mlp": 1.00079322, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 8.499411499324601, + "language_loss": 0.77077556, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79388076, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.6013035774230957 + }, + { + "auxiliary_loss_clip": 0.01126731, + "auxiliary_loss_mlp": 0.01137525, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00076783, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.594470259226514, + "language_loss": 0.81805825, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84070081, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.592411518096924 + }, + { + "auxiliary_loss_clip": 0.01173334, + "auxiliary_loss_mlp": 0.01136322, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00061429, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.6745723910071408, + "language_loss": 0.74787927, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.77097583, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.526639461517334 + }, + { + "auxiliary_loss_clip": 0.01156551, + "auxiliary_loss_mlp": 0.00748036, + "balance_loss_clip": 1.00189745, + "balance_loss_mlp": 1.00075388, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.6710600433415272, + "language_loss": 0.84036267, + "learning_rate": 3.26046097371721e-06, + "loss": 0.8594085, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.5446393489837646 + }, + { + "auxiliary_loss_clip": 0.01156187, + "auxiliary_loss_mlp": 0.01137132, + "balance_loss_clip": 1.00172472, + "balance_loss_mlp": 1.00066161, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.856857571268785, + "language_loss": 0.75637794, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.77931106, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.5518674850463867 + }, + { + "auxiliary_loss_clip": 0.01141915, + "auxiliary_loss_mlp": 0.01138542, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00092721, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 3.3089369573989478, + "language_loss": 0.62277502, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64557958, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.6543774604797363 + }, + { + "auxiliary_loss_clip": 0.01141575, + "auxiliary_loss_mlp": 0.01138008, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00086987, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.9664386297078889, + "language_loss": 0.82665849, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.8494544, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.526794672012329 + }, + { + "auxiliary_loss_clip": 0.01173342, + "auxiliary_loss_mlp": 0.01136528, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00081992, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 1.7528828316246499, + "language_loss": 0.63251871, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65561736, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.5089385509490967 + }, + { + "auxiliary_loss_clip": 0.01157641, + "auxiliary_loss_mlp": 0.01137137, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00066638, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.7606795373036896, + "language_loss": 0.74968469, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77263242, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.550947666168213 + }, + { + "auxiliary_loss_clip": 0.01131663, + "auxiliary_loss_mlp": 0.01136959, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00086951, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 2.0587169022298673, + "language_loss": 0.7547881, + "learning_rate": 3.258645826569261e-06, + "loss": 0.77747428, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 4.020958662033081 + }, + { + "auxiliary_loss_clip": 0.01173655, + "auxiliary_loss_mlp": 0.00748166, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00085592, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.9985669793415874, + "language_loss": 0.81668597, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83590412, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.5840651988983154 + }, + { + "auxiliary_loss_clip": 0.01142821, + "auxiliary_loss_mlp": 0.01137138, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00066686, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.5841447237625101, + "language_loss": 0.75998473, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78278428, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.5911715030670166 + }, + { + "auxiliary_loss_clip": 0.01124607, + "auxiliary_loss_mlp": 0.01137164, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00097966, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.258408285354508, + "language_loss": 0.70658547, + "learning_rate": 3.257737608512723e-06, + "loss": 0.72920316, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.6300582885742188 + }, + { + "auxiliary_loss_clip": 0.0115698, + "auxiliary_loss_mlp": 0.01137801, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00085306, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 4.456715031775742, + "language_loss": 0.76179934, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78474718, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.5115745067596436 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01136827, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00064278, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.9867298916467389, + "language_loss": 0.7458058, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76857406, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.642112970352173 + }, + { + "auxiliary_loss_clip": 0.0117375, + "auxiliary_loss_mlp": 0.0113849, + "balance_loss_clip": 1.00210953, + "balance_loss_mlp": 1.00068378, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.537328504958491, + "language_loss": 0.75519842, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77832079, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 3.889594554901123 + }, + { + "auxiliary_loss_clip": 0.01147478, + "auxiliary_loss_mlp": 0.01137622, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00086522, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.7032012901093094, + "language_loss": 0.79567528, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81852627, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 2.5730552673339844 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01136425, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00062156, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.5860801046493873, + "language_loss": 0.74523664, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76768982, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.6220874786376953 + }, + { + "auxiliary_loss_clip": 0.01109483, + "auxiliary_loss_mlp": 0.01136544, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00083601, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 1.8643213055688839, + "language_loss": 0.67201626, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69447649, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 2.633784294128418 + }, + { + "auxiliary_loss_clip": 0.01158197, + "auxiliary_loss_mlp": 0.01136644, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00074577, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 2.090500216307653, + "language_loss": 0.79740232, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82035077, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 3.966320276260376 + }, + { + "auxiliary_loss_clip": 0.0115674, + "auxiliary_loss_mlp": 0.00748111, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00078726, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 3.080580491882986, + "language_loss": 0.80878401, + "learning_rate": 3.255313596022074e-06, + "loss": 0.82783246, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 2.575270414352417 + }, + { + "auxiliary_loss_clip": 0.01156743, + "auxiliary_loss_mlp": 0.01137063, + "balance_loss_clip": 1.00196016, + "balance_loss_mlp": 1.00078344, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.661483578647861, + "language_loss": 0.7192654, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74220347, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.6310293674468994 + }, + { + "auxiliary_loss_clip": 0.01156633, + "auxiliary_loss_mlp": 0.01137037, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00085235, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9393035671430385, + "language_loss": 0.72742325, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75035995, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.6018571853637695 + }, + { + "auxiliary_loss_clip": 0.01142404, + "auxiliary_loss_mlp": 0.00748091, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.00066483, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 1.8473838181684197, + "language_loss": 0.71179283, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73069775, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.560889482498169 + }, + { + "auxiliary_loss_clip": 0.01126066, + "auxiliary_loss_mlp": 0.01137384, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00053144, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.1954746431515715, + "language_loss": 0.78705919, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80969357, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.5942113399505615 + }, + { + "auxiliary_loss_clip": 0.01173424, + "auxiliary_loss_mlp": 0.01136584, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00058961, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.580268531882426, + "language_loss": 0.78097332, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80407333, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.5534896850585938 + }, + { + "auxiliary_loss_clip": 0.01143038, + "auxiliary_loss_mlp": 0.01137209, + "balance_loss_clip": 1.00200689, + "balance_loss_mlp": 1.00083375, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 2.3694162839548834, + "language_loss": 0.76652563, + "learning_rate": 3.253493587064563e-06, + "loss": 0.7893281, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.5752670764923096 + }, + { + "auxiliary_loss_clip": 0.01156647, + "auxiliary_loss_mlp": 0.01136996, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.0008111, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 1.8945165833965536, + "language_loss": 0.72324049, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74617696, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 2.6051323413848877 + }, + { + "auxiliary_loss_clip": 0.01158319, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.0008173, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 8.818829944901168, + "language_loss": 0.79298913, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81595004, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.502718210220337 + }, + { + "auxiliary_loss_clip": 0.01140045, + "auxiliary_loss_mlp": 0.0113726, + "balance_loss_clip": 1.00198758, + "balance_loss_mlp": 1.00059867, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.8408463682407907, + "language_loss": 0.77167594, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79444897, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.60097336769104 + }, + { + "auxiliary_loss_clip": 0.01141531, + "auxiliary_loss_mlp": 0.01137705, + "balance_loss_clip": 1.001912, + "balance_loss_mlp": 1.00075769, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 2.114141789190878, + "language_loss": 0.76670778, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78950012, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.6406350135803223 + }, + { + "auxiliary_loss_clip": 0.01079191, + "auxiliary_loss_mlp": 0.01136811, + "balance_loss_clip": 1.0016253, + "balance_loss_mlp": 1.00062656, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 2.6332994833052887, + "language_loss": 0.7204529, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74261296, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.7258455753326416 + }, + { + "auxiliary_loss_clip": 0.01140398, + "auxiliary_loss_mlp": 0.01137673, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00072515, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 2.2112599180677033, + "language_loss": 0.82379925, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84657991, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.615854501724243 + }, + { + "auxiliary_loss_clip": 0.01173401, + "auxiliary_loss_mlp": 0.00748204, + "balance_loss_clip": 1.00200784, + "balance_loss_mlp": 1.00083148, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.7086198099294758, + "language_loss": 0.74808538, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.76730144, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.536627769470215 + }, + { + "auxiliary_loss_clip": 0.01139638, + "auxiliary_loss_mlp": 0.01136512, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00061321, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 2.0525845681122017, + "language_loss": 0.75775647, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78051805, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.5725107192993164 + }, + { + "auxiliary_loss_clip": 0.01156806, + "auxiliary_loss_mlp": 0.01136916, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 1.00073123, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7336878641821978, + "language_loss": 0.80528355, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82822073, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.595263719558716 + }, + { + "auxiliary_loss_clip": 0.01156874, + "auxiliary_loss_mlp": 0.01137469, + "balance_loss_clip": 1.00198877, + "balance_loss_mlp": 1.00061655, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 1.970673412257502, + "language_loss": 0.81823635, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84117985, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.5351061820983887 + }, + { + "auxiliary_loss_clip": 0.01173424, + "auxiliary_loss_mlp": 0.01136838, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.00084364, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 2.234034288846302, + "language_loss": 0.78268087, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80578345, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 2.506784439086914 + }, + { + "auxiliary_loss_clip": 0.0111284, + "auxiliary_loss_mlp": 0.01136166, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00064886, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 3.257792874855107, + "language_loss": 0.83552182, + "learning_rate": 3.249848438115917e-06, + "loss": 0.8580119, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.710258960723877 + }, + { + "auxiliary_loss_clip": 0.01173388, + "auxiliary_loss_mlp": 0.01137036, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.00085092, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.749508745123196, + "language_loss": 0.85711443, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.88021863, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.5273661613464355 + }, + { + "auxiliary_loss_clip": 0.0112565, + "auxiliary_loss_mlp": 0.0113724, + "balance_loss_clip": 1.00184929, + "balance_loss_mlp": 1.00076902, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 2.1968245211719077, + "language_loss": 0.7957536, + "learning_rate": 3.249240249232065e-06, + "loss": 0.8183825, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.5952649116516113 + }, + { + "auxiliary_loss_clip": 0.01124871, + "auxiliary_loss_mlp": 0.01136987, + "balance_loss_clip": 1.00189054, + "balance_loss_mlp": 1.00089765, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.6786385657575882, + "language_loss": 0.80166376, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82428235, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.628119468688965 + }, + { + "auxiliary_loss_clip": 0.01173474, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_clip": 1.00202096, + "balance_loss_mlp": 1.00063205, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.829357143796359, + "language_loss": 0.88756227, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91066623, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.5202906131744385 + }, + { + "auxiliary_loss_clip": 0.01158475, + "auxiliary_loss_mlp": 0.01137082, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00089765, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.8125413799722065, + "language_loss": 0.73903203, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76198757, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.5441176891326904 + }, + { + "auxiliary_loss_clip": 0.01158273, + "auxiliary_loss_mlp": 0.00748084, + "balance_loss_clip": 1.00198543, + "balance_loss_mlp": 1.00068402, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 2.1397209360400837, + "language_loss": 0.73181176, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75087523, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.5604584217071533 + }, + { + "auxiliary_loss_clip": 0.01139978, + "auxiliary_loss_mlp": 0.01136851, + "balance_loss_clip": 1.00197566, + "balance_loss_mlp": 1.00085711, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 4.829797841946194, + "language_loss": 0.8720178, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89478612, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.6094393730163574 + }, + { + "auxiliary_loss_clip": 0.01124911, + "auxiliary_loss_mlp": 0.01137716, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00086403, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.6671231043902983, + "language_loss": 0.71950203, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.74212831, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.6434500217437744 + }, + { + "auxiliary_loss_clip": 0.01122706, + "auxiliary_loss_mlp": 0.01136466, + "balance_loss_clip": 1.00172925, + "balance_loss_mlp": 1.00075817, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.221222820206517, + "language_loss": 0.72544932, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74804103, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 4.0877320766448975 + }, + { + "auxiliary_loss_clip": 0.01141146, + "auxiliary_loss_mlp": 0.01136989, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00090003, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.6154201109536115, + "language_loss": 0.85871142, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.88149279, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.6019413471221924 + }, + { + "auxiliary_loss_clip": 0.01141312, + "auxiliary_loss_mlp": 0.01136011, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00058949, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.9546338336018876, + "language_loss": 0.67392814, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69670147, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.61932110786438 + }, + { + "auxiliary_loss_clip": 0.01156903, + "auxiliary_loss_mlp": 0.0113595, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00071907, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.51091814001752, + "language_loss": 0.76961058, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79253918, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.601378917694092 + }, + { + "auxiliary_loss_clip": 0.0117353, + "auxiliary_loss_mlp": 0.01136669, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00057936, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 2.8794271165906844, + "language_loss": 0.66862416, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69172615, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.57179594039917 + }, + { + "auxiliary_loss_clip": 0.01156857, + "auxiliary_loss_mlp": 0.01137553, + "balance_loss_clip": 1.00199151, + "balance_loss_mlp": 1.00070095, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.7164764197854412, + "language_loss": 0.79565048, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81859452, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 3.9990952014923096 + }, + { + "auxiliary_loss_clip": 0.01129349, + "auxiliary_loss_mlp": 0.0074795, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00077581, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.8969379758708398, + "language_loss": 0.76923919, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.78801221, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.599855899810791 + }, + { + "auxiliary_loss_clip": 0.01126122, + "auxiliary_loss_mlp": 0.01137315, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00084472, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.7776311210870295, + "language_loss": 0.62738526, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.65001965, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.618764638900757 + }, + { + "auxiliary_loss_clip": 0.01158249, + "auxiliary_loss_mlp": 0.01136922, + "balance_loss_clip": 1.00196636, + "balance_loss_mlp": 1.00073743, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 2.3962237435982585, + "language_loss": 0.82331556, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84626722, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 2.582111120223999 + }, + { + "auxiliary_loss_clip": 0.01139687, + "auxiliary_loss_mlp": 0.01136473, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00086081, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 2.2138658875674095, + "language_loss": 0.76267308, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78543466, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 3.9866034984588623 + }, + { + "auxiliary_loss_clip": 0.01124803, + "auxiliary_loss_mlp": 0.0113754, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00068736, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.1894629504670426, + "language_loss": 0.71522725, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.73785067, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 4.068415641784668 + }, + { + "auxiliary_loss_clip": 0.01107736, + "auxiliary_loss_mlp": 0.01135718, + "balance_loss_clip": 1.00159609, + "balance_loss_mlp": 1.00067747, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.596770791369557, + "language_loss": 0.74285591, + "learning_rate": 3.243758033520219e-06, + "loss": 0.7652905, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 2.6634976863861084 + }, + { + "auxiliary_loss_clip": 0.01156925, + "auxiliary_loss_mlp": 0.01136779, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00097537, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.8090160571812306, + "language_loss": 0.80549383, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82843083, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.578538179397583 + }, + { + "auxiliary_loss_clip": 0.01158098, + "auxiliary_loss_mlp": 0.01136632, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00073361, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.6073985659327257, + "language_loss": 0.79960686, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82255423, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.5010783672332764 + }, + { + "auxiliary_loss_clip": 0.01141771, + "auxiliary_loss_mlp": 0.01136292, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.0007745, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5502651275124926, + "language_loss": 0.82559407, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84837466, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.6286978721618652 + }, + { + "auxiliary_loss_clip": 0.01154326, + "auxiliary_loss_mlp": 0.01123676, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00084305, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7462010896636099, + "language_loss": 0.58615649, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60893655, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.2762062549591064 + }, + { + "auxiliary_loss_clip": 0.01158289, + "auxiliary_loss_mlp": 0.00747925, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00075221, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.8435851597245743, + "language_loss": 0.8359192, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85498136, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.5835838317871094 + }, + { + "auxiliary_loss_clip": 0.01173448, + "auxiliary_loss_mlp": 0.01136527, + "balance_loss_clip": 1.00204194, + "balance_loss_mlp": 1.00081885, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.102395653531446, + "language_loss": 0.78639156, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.80949134, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.5380377769470215 + }, + { + "auxiliary_loss_clip": 0.01158206, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_clip": 1.00205219, + "balance_loss_mlp": 1.00074887, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 2.045811227889034, + "language_loss": 0.6471833, + "learning_rate": 3.241621930235989e-06, + "loss": 0.67013466, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.523646354675293 + }, + { + "auxiliary_loss_clip": 0.01107969, + "auxiliary_loss_mlp": 0.01134991, + "balance_loss_clip": 1.00166106, + "balance_loss_mlp": 1.00071335, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5323538499686282, + "language_loss": 0.8676976, + "learning_rate": 3.241316584201646e-06, + "loss": 0.89012712, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.703144073486328 + }, + { + "auxiliary_loss_clip": 0.0111285, + "auxiliary_loss_mlp": 0.01136581, + "balance_loss_clip": 1.00158405, + "balance_loss_mlp": 1.00058699, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.7830215243870604, + "language_loss": 0.68616223, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70865655, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.773580551147461 + }, + { + "auxiliary_loss_clip": 0.01156786, + "auxiliary_loss_mlp": 0.00747959, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00067019, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.9706167454500383, + "language_loss": 0.71243691, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73148441, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.6183433532714844 + }, + { + "auxiliary_loss_clip": 0.01120825, + "auxiliary_loss_mlp": 0.0112239, + "balance_loss_clip": 1.00177145, + "balance_loss_mlp": 1.00031972, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8304037762663616, + "language_loss": 0.59200954, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61444163, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.172470808029175 + }, + { + "auxiliary_loss_clip": 0.01141334, + "auxiliary_loss_mlp": 0.0113729, + "balance_loss_clip": 1.00203574, + "balance_loss_mlp": 1.00091493, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.1647772211459646, + "language_loss": 0.72554851, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.74833477, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.5969655513763428 + }, + { + "auxiliary_loss_clip": 0.01124903, + "auxiliary_loss_mlp": 0.01136959, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00077474, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.6118077483745923, + "language_loss": 0.71229708, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73491573, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.649117946624756 + }, + { + "auxiliary_loss_clip": 0.01173299, + "auxiliary_loss_mlp": 0.00747961, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.0007391, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.967210489668679, + "language_loss": 0.90736258, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92657518, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.5027310848236084 + }, + { + "auxiliary_loss_clip": 0.01141361, + "auxiliary_loss_mlp": 0.01136573, + "balance_loss_clip": 1.00189769, + "balance_loss_mlp": 1.00067437, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.789854608941773, + "language_loss": 0.67133355, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69411284, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.78135347366333 + }, + { + "auxiliary_loss_clip": 0.01157425, + "auxiliary_loss_mlp": 0.0113786, + "balance_loss_clip": 1.00235569, + "balance_loss_mlp": 1.00081682, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 4.284185099607121, + "language_loss": 0.83448327, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85743612, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.580733060836792 + }, + { + "auxiliary_loss_clip": 0.01122692, + "auxiliary_loss_mlp": 0.01122187, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00011718, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.6984237344877559, + "language_loss": 0.55295831, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57540721, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.3269882202148438 + }, + { + "auxiliary_loss_clip": 0.01140497, + "auxiliary_loss_mlp": 0.00748116, + "balance_loss_clip": 1.00214994, + "balance_loss_mlp": 1.00091028, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.8236155917983998, + "language_loss": 0.76025569, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.77914184, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 3.0728979110717773 + }, + { + "auxiliary_loss_clip": 0.01123026, + "auxiliary_loss_mlp": 0.01135639, + "balance_loss_clip": 1.0017271, + "balance_loss_mlp": 1.00069368, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.9222993070881027, + "language_loss": 0.80189496, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82448161, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.6047556400299072 + }, + { + "auxiliary_loss_clip": 0.0111152, + "auxiliary_loss_mlp": 0.0113784, + "balance_loss_clip": 1.00227928, + "balance_loss_mlp": 1.00098825, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.3835318521465152, + "language_loss": 0.81439531, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83688891, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.712737798690796 + }, + { + "auxiliary_loss_clip": 0.01157049, + "auxiliary_loss_mlp": 0.01137256, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00069022, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.4344831105548526, + "language_loss": 0.77250105, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79544413, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.5755951404571533 + }, + { + "auxiliary_loss_clip": 0.01124255, + "auxiliary_loss_mlp": 0.01135785, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00093579, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 1.9386845059333906, + "language_loss": 0.78580689, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80840731, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.6075427532196045 + }, + { + "auxiliary_loss_clip": 0.01145845, + "auxiliary_loss_mlp": 0.011368, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00071073, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 3.470287996816184, + "language_loss": 0.87485886, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89768529, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.5635881423950195 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01137138, + "balance_loss_clip": 1.00194728, + "balance_loss_mlp": 1.00085831, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.9494002705961648, + "language_loss": 0.78796661, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81090605, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.5315608978271484 + }, + { + "auxiliary_loss_clip": 0.01123231, + "auxiliary_loss_mlp": 0.01136135, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.0006175, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 2.0353848890627297, + "language_loss": 0.72105837, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74365205, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.6411619186401367 + }, + { + "auxiliary_loss_clip": 0.01157876, + "auxiliary_loss_mlp": 0.011367, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00089693, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 2.0931278303362166, + "language_loss": 0.73776835, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76071411, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 4.007487058639526 + }, + { + "auxiliary_loss_clip": 0.011473, + "auxiliary_loss_mlp": 0.01136507, + "balance_loss_clip": 1.00197828, + "balance_loss_mlp": 1.00089395, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6792847582967294, + "language_loss": 0.76908064, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.79191869, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.5994884967803955 + }, + { + "auxiliary_loss_clip": 0.01141527, + "auxiliary_loss_mlp": 0.01136507, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00060809, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 1.7068836235887739, + "language_loss": 0.66610968, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68888998, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.5674004554748535 + }, + { + "auxiliary_loss_clip": 0.01156934, + "auxiliary_loss_mlp": 0.01136799, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00090003, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.911857986896729, + "language_loss": 0.75061589, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77355325, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.571305513381958 + }, + { + "auxiliary_loss_clip": 0.01156826, + "auxiliary_loss_mlp": 0.01136954, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00086427, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.0506213707655716, + "language_loss": 0.7254563, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74839407, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.4810385704040527 + }, + { + "auxiliary_loss_clip": 0.01108978, + "auxiliary_loss_mlp": 0.01137312, + "balance_loss_clip": 1.00174594, + "balance_loss_mlp": 1.00084066, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.3565744472346783, + "language_loss": 0.84786427, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87032717, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.6607115268707275 + }, + { + "auxiliary_loss_clip": 0.011095, + "auxiliary_loss_mlp": 0.01136903, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00071836, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.6709824519707535, + "language_loss": 0.78720975, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80967379, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 4.059000253677368 + }, + { + "auxiliary_loss_clip": 0.01156854, + "auxiliary_loss_mlp": 0.01136929, + "balance_loss_clip": 1.00207031, + "balance_loss_mlp": 1.00084007, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 1.9931280445860622, + "language_loss": 0.67315716, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69609499, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 2.5120725631713867 + }, + { + "auxiliary_loss_clip": 0.01091151, + "auxiliary_loss_mlp": 0.01136785, + "balance_loss_clip": 1.00166953, + "balance_loss_mlp": 1.00088668, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 1.8920673813475621, + "language_loss": 0.83222717, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.85450649, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.7438204288482666 + }, + { + "auxiliary_loss_clip": 0.01141282, + "auxiliary_loss_mlp": 0.00748096, + "balance_loss_clip": 1.00180495, + "balance_loss_mlp": 1.00091052, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.695694995769264, + "language_loss": 0.73598635, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.75488013, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 5.439128398895264 + }, + { + "auxiliary_loss_clip": 0.01158157, + "auxiliary_loss_mlp": 0.01136689, + "balance_loss_clip": 1.00206196, + "balance_loss_mlp": 1.00069523, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 3.200865600992773, + "language_loss": 0.76271832, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78566682, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 2.544013738632202 + }, + { + "auxiliary_loss_clip": 0.01142364, + "auxiliary_loss_mlp": 0.01136904, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00091028, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7264346290605073, + "language_loss": 0.78981274, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81260544, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 2.5584378242492676 + }, + { + "auxiliary_loss_clip": 0.01157309, + "auxiliary_loss_mlp": 0.01138188, + "balance_loss_clip": 1.00221717, + "balance_loss_mlp": 1.00085855, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.320816342263667, + "language_loss": 0.75282478, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77577978, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.546767234802246 + }, + { + "auxiliary_loss_clip": 0.01127017, + "auxiliary_loss_mlp": 0.01135959, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00072813, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.5910695389216802, + "language_loss": 0.69358718, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71621698, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.6807069778442383 + }, + { + "auxiliary_loss_clip": 0.01107368, + "auxiliary_loss_mlp": 0.01135527, + "balance_loss_clip": 1.00174248, + "balance_loss_mlp": 1.00086784, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.8742549881467416, + "language_loss": 0.84775448, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.87018341, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.657893657684326 + }, + { + "auxiliary_loss_clip": 0.01140154, + "auxiliary_loss_mlp": 0.01136874, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00068927, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 2.2582114427145332, + "language_loss": 0.84993619, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87270647, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.5568110942840576 + }, + { + "auxiliary_loss_clip": 0.01157648, + "auxiliary_loss_mlp": 0.01136522, + "balance_loss_clip": 1.00201833, + "balance_loss_mlp": 1.00081432, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 1.9811824090323176, + "language_loss": 0.75509006, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77803171, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.588118076324463 + }, + { + "auxiliary_loss_clip": 0.01156739, + "auxiliary_loss_mlp": 0.0113706, + "balance_loss_clip": 1.00197947, + "balance_loss_mlp": 1.00078011, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9669356580162243, + "language_loss": 0.81703162, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83996964, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.496567964553833 + }, + { + "auxiliary_loss_clip": 0.01156186, + "auxiliary_loss_mlp": 0.01136129, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00080287, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.5386308229450452, + "language_loss": 0.82910389, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.852027, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.519908905029297 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01136808, + "balance_loss_clip": 1.00222099, + "balance_loss_mlp": 1.00081372, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 2.053299596436567, + "language_loss": 0.76319814, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78630155, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.5032448768615723 + }, + { + "auxiliary_loss_clip": 0.01115361, + "auxiliary_loss_mlp": 0.01136918, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.0008285, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.8553579247493077, + "language_loss": 0.74654913, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76907194, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.6500062942504883 + }, + { + "auxiliary_loss_clip": 0.0112534, + "auxiliary_loss_mlp": 0.01136499, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00079107, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.569425794714368, + "language_loss": 0.75938702, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78200537, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.5962820053100586 + }, + { + "auxiliary_loss_clip": 0.01141119, + "auxiliary_loss_mlp": 0.0113688, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.0007906, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.1401198873699836, + "language_loss": 0.73516691, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75794691, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.546234130859375 + }, + { + "auxiliary_loss_clip": 0.01106103, + "auxiliary_loss_mlp": 0.0112288, + "balance_loss_clip": 1.00220513, + "balance_loss_mlp": 1.00004637, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7077705295999329, + "language_loss": 0.52981174, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55210149, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.307523488998413 + }, + { + "auxiliary_loss_clip": 0.01156788, + "auxiliary_loss_mlp": 0.01136959, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00087011, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 2.3148893205018757, + "language_loss": 0.78656614, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80950356, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.5332303047180176 + }, + { + "auxiliary_loss_clip": 0.01141327, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00082755, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.4444430239231212, + "language_loss": 0.64187813, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66465193, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.7035069465637207 + }, + { + "auxiliary_loss_clip": 0.01125856, + "auxiliary_loss_mlp": 0.00748351, + "balance_loss_clip": 1.00244594, + "balance_loss_mlp": 1.00111794, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.176530723388042, + "language_loss": 0.77701497, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79575706, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.693493604660034 + }, + { + "auxiliary_loss_clip": 0.01114646, + "auxiliary_loss_mlp": 0.01136677, + "balance_loss_clip": 1.00223243, + "balance_loss_mlp": 1.00115991, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 2.3362173916358966, + "language_loss": 0.83780688, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86032009, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.667491912841797 + }, + { + "auxiliary_loss_clip": 0.01109312, + "auxiliary_loss_mlp": 0.01137269, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00089359, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.8750674010029116, + "language_loss": 0.83763134, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86009705, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.643350601196289 + }, + { + "auxiliary_loss_clip": 0.01173344, + "auxiliary_loss_mlp": 0.01135702, + "balance_loss_clip": 1.00215673, + "balance_loss_mlp": 1.00075758, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.7465606174857229, + "language_loss": 0.83778775, + "learning_rate": 3.226912425313001e-06, + "loss": 0.86087817, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.558257579803467 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 1.00100327, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.1663631353363724, + "language_loss": 0.85141414, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87419331, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.5858354568481445 + }, + { + "auxiliary_loss_clip": 0.0109501, + "auxiliary_loss_mlp": 0.01137076, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00089109, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.6949891159956767, + "language_loss": 0.8340342, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85635507, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.7469100952148438 + }, + { + "auxiliary_loss_clip": 0.01157948, + "auxiliary_loss_mlp": 0.0113568, + "balance_loss_clip": 1.00196993, + "balance_loss_mlp": 1.00064015, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 2.077733407885971, + "language_loss": 0.80968338, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.83261967, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.5403714179992676 + }, + { + "auxiliary_loss_clip": 0.01158167, + "auxiliary_loss_mlp": 0.007481, + "balance_loss_clip": 1.00223756, + "balance_loss_mlp": 1.00093317, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.594840643889915, + "language_loss": 0.8028568, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.8219195, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.575741767883301 + }, + { + "auxiliary_loss_clip": 0.01140546, + "auxiliary_loss_mlp": 0.01137192, + "balance_loss_clip": 1.00219405, + "balance_loss_mlp": 1.00072157, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.92548820852977, + "language_loss": 0.81674862, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83952606, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.5671212673187256 + }, + { + "auxiliary_loss_clip": 0.01124317, + "auxiliary_loss_mlp": 0.01136303, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00107205, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.6402998208242057, + "language_loss": 0.78079629, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80340242, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.5993809700012207 + }, + { + "auxiliary_loss_clip": 0.01110272, + "auxiliary_loss_mlp": 0.01135468, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00071383, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 2.1751647678997497, + "language_loss": 0.83512026, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85757768, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.7316534519195557 + }, + { + "auxiliary_loss_clip": 0.01126099, + "auxiliary_loss_mlp": 0.01135764, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00091398, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 2.178131697726592, + "language_loss": 0.7404834, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76310199, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 4.1112494468688965 + }, + { + "auxiliary_loss_clip": 0.0110805, + "auxiliary_loss_mlp": 0.00748059, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.000893, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8802453077813173, + "language_loss": 0.71057856, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72913969, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.7669990062713623 + }, + { + "auxiliary_loss_clip": 0.01124671, + "auxiliary_loss_mlp": 0.01123054, + "balance_loss_clip": 1.00259149, + "balance_loss_mlp": 1.00022078, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9447240928581457, + "language_loss": 0.59676665, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61924386, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.2242414951324463 + }, + { + "auxiliary_loss_clip": 0.01141329, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_clip": 1.00202429, + "balance_loss_mlp": 1.00101352, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.1031373348722346, + "language_loss": 0.70057893, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72335649, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.5728025436401367 + }, + { + "auxiliary_loss_clip": 0.01126753, + "auxiliary_loss_mlp": 0.01136171, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00103486, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.884776880591155, + "language_loss": 0.63438737, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.65701663, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.590705633163452 + }, + { + "auxiliary_loss_clip": 0.0114117, + "auxiliary_loss_mlp": 0.01136178, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00085139, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 1.9142708827779973, + "language_loss": 0.86526215, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.8880356, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.608280658721924 + }, + { + "auxiliary_loss_clip": 0.01173452, + "auxiliary_loss_mlp": 0.00748181, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00099087, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.609644164945035, + "language_loss": 0.63253808, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.65175444, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.6443135738372803 + }, + { + "auxiliary_loss_clip": 0.0112518, + "auxiliary_loss_mlp": 0.01136836, + "balance_loss_clip": 1.00230873, + "balance_loss_mlp": 1.00084209, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.3505146199110056, + "language_loss": 0.82636833, + "learning_rate": 3.222293661638346e-06, + "loss": 0.84898841, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 4.046981573104858 + }, + { + "auxiliary_loss_clip": 0.01045349, + "auxiliary_loss_mlp": 0.01135812, + "balance_loss_clip": 1.00165808, + "balance_loss_mlp": 1.00077176, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 2.1175902130537745, + "language_loss": 0.79063421, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81244588, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 2.8554303646087646 + }, + { + "auxiliary_loss_clip": 0.01112074, + "auxiliary_loss_mlp": 0.01136025, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00079405, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.541197691724149, + "language_loss": 0.75142491, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77390599, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 3.068235158920288 + }, + { + "auxiliary_loss_clip": 0.01155505, + "auxiliary_loss_mlp": 0.00746804, + "balance_loss_clip": 1.00297737, + "balance_loss_mlp": 1.0002799, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8417822893060392, + "language_loss": 0.63923633, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65825939, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 6.514969825744629 + }, + { + "auxiliary_loss_clip": 0.01156713, + "auxiliary_loss_mlp": 0.01135794, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00065827, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.7748649339518556, + "language_loss": 0.803738, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82666308, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.67124080657959 + }, + { + "auxiliary_loss_clip": 0.01141245, + "auxiliary_loss_mlp": 0.01136532, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00082469, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.880228361334416, + "language_loss": 0.72257465, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74535245, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 2.6383862495422363 + }, + { + "auxiliary_loss_clip": 0.01173449, + "auxiliary_loss_mlp": 0.01135676, + "balance_loss_clip": 1.0022459, + "balance_loss_mlp": 1.00082624, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.5041607589119161, + "language_loss": 0.76503861, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78812981, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.536417245864868 + }, + { + "auxiliary_loss_clip": 0.01173508, + "auxiliary_loss_mlp": 0.01136189, + "balance_loss_clip": 1.00224841, + "balance_loss_mlp": 1.00086248, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.5964736576206968, + "language_loss": 0.77677333, + "learning_rate": 3.220134667280476e-06, + "loss": 0.79987031, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.563964366912842 + }, + { + "auxiliary_loss_clip": 0.01140356, + "auxiliary_loss_mlp": 0.0074679, + "balance_loss_clip": 1.00305128, + "balance_loss_mlp": 1.00035858, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7766912500802254, + "language_loss": 0.54776299, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56663448, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.1822807788848877 + }, + { + "auxiliary_loss_clip": 0.01173269, + "auxiliary_loss_mlp": 0.01135337, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.0008688, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.5820148379566774, + "language_loss": 0.66612148, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68920755, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.497943878173828 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01136523, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.00072026, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.2893907279792867, + "language_loss": 0.69702756, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71965307, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.6258111000061035 + }, + { + "auxiliary_loss_clip": 0.01157998, + "auxiliary_loss_mlp": 0.01135302, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00092936, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 3.562528820093866, + "language_loss": 0.79066104, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81359404, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.512983798980713 + }, + { + "auxiliary_loss_clip": 0.01157387, + "auxiliary_loss_mlp": 0.01134843, + "balance_loss_clip": 1.00208843, + "balance_loss_mlp": 1.00075638, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 3.2802815805984187, + "language_loss": 0.83609557, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85901779, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.546947717666626 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01135624, + "balance_loss_clip": 1.00222945, + "balance_loss_mlp": 1.00077438, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.2255276129546098, + "language_loss": 0.69473553, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71782625, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.4621081352233887 + }, + { + "auxiliary_loss_clip": 0.01173421, + "auxiliary_loss_mlp": 0.01135944, + "balance_loss_clip": 1.00217319, + "balance_loss_mlp": 1.00090408, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.948646475655168, + "language_loss": 0.83811253, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86120617, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.4724314212799072 + }, + { + "auxiliary_loss_clip": 0.01108497, + "auxiliary_loss_mlp": 0.01136562, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.00066364, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.1328555170924175, + "language_loss": 0.60169542, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62414598, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.6879374980926514 + }, + { + "auxiliary_loss_clip": 0.01141519, + "auxiliary_loss_mlp": 0.01135603, + "balance_loss_clip": 1.00216961, + "balance_loss_mlp": 1.00075352, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.7191231282366586, + "language_loss": 0.65836281, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68113405, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.606384515762329 + }, + { + "auxiliary_loss_clip": 0.01158169, + "auxiliary_loss_mlp": 0.01135343, + "balance_loss_clip": 1.00215781, + "balance_loss_mlp": 1.0009706, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 2.0688181397429672, + "language_loss": 0.76745427, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79038942, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.5811779499053955 + }, + { + "auxiliary_loss_clip": 0.01173286, + "auxiliary_loss_mlp": 0.01135199, + "balance_loss_clip": 1.00218964, + "balance_loss_mlp": 1.00073099, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 1.9569019644408894, + "language_loss": 0.82806396, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85114884, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 2.496934175491333 + }, + { + "auxiliary_loss_clip": 0.01156528, + "auxiliary_loss_mlp": 0.0113516, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00078738, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5061292759159617, + "language_loss": 0.71229184, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73520875, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.5596039295196533 + }, + { + "auxiliary_loss_clip": 0.01140987, + "auxiliary_loss_mlp": 0.01135242, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00077343, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.9737259984054312, + "language_loss": 0.74695498, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76971728, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.5737173557281494 + }, + { + "auxiliary_loss_clip": 0.01173236, + "auxiliary_loss_mlp": 0.0113487, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00078321, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.81231022897915, + "language_loss": 0.77604711, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79912817, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.5041909217834473 + }, + { + "auxiliary_loss_clip": 0.01157529, + "auxiliary_loss_mlp": 0.01134812, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00091577, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8364228449996358, + "language_loss": 0.79182434, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81474769, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.5527546405792236 + }, + { + "auxiliary_loss_clip": 0.01157611, + "auxiliary_loss_mlp": 0.01134254, + "balance_loss_clip": 1.00208282, + "balance_loss_mlp": 1.00073981, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.6412106487463936, + "language_loss": 0.79543519, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81835389, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.561352014541626 + }, + { + "auxiliary_loss_clip": 0.01147483, + "auxiliary_loss_mlp": 0.01135866, + "balance_loss_clip": 1.00228488, + "balance_loss_mlp": 1.00101638, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.798025925136483, + "language_loss": 0.71401012, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73684353, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.6208643913269043 + }, + { + "auxiliary_loss_clip": 0.01156662, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00085902, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 1.8297597916461048, + "language_loss": 0.77275109, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79567194, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.5344464778900146 + }, + { + "auxiliary_loss_clip": 0.01108263, + "auxiliary_loss_mlp": 0.01134731, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00073981, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6559898891821776, + "language_loss": 0.82600558, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84843552, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.703383684158325 + }, + { + "auxiliary_loss_clip": 0.0115686, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_clip": 1.00209582, + "balance_loss_mlp": 1.00076914, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 1.9315769394328208, + "language_loss": 0.78987163, + "learning_rate": 3.213953633415686e-06, + "loss": 0.8127926, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.5354866981506348 + }, + { + "auxiliary_loss_clip": 0.01141792, + "auxiliary_loss_mlp": 0.01135942, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00061512, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.7652747623933922, + "language_loss": 0.6840409, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70681822, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.647611618041992 + }, + { + "auxiliary_loss_clip": 0.01142348, + "auxiliary_loss_mlp": 0.01135345, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00078154, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.503703632160304, + "language_loss": 0.80742264, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83019954, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.5700252056121826 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.0113562, + "balance_loss_clip": 1.00215471, + "balance_loss_mlp": 1.00077081, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.1301405976533045, + "language_loss": 0.69112945, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.7142179, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.486842393875122 + }, + { + "auxiliary_loss_clip": 0.01140823, + "auxiliary_loss_mlp": 0.01135377, + "balance_loss_clip": 1.00203514, + "balance_loss_mlp": 1.00071836, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 2.1862472207554116, + "language_loss": 0.80349112, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.82625312, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 4.06183385848999 + }, + { + "auxiliary_loss_clip": 0.01158082, + "auxiliary_loss_mlp": 0.01135313, + "balance_loss_clip": 1.00218785, + "balance_loss_mlp": 1.00103605, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.6872985443104542, + "language_loss": 0.7341609, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75709486, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.5433149337768555 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.01135171, + "balance_loss_clip": 1.00202966, + "balance_loss_mlp": 1.0007987, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.8076804503683488, + "language_loss": 0.81870633, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84147251, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.581043004989624 + }, + { + "auxiliary_loss_clip": 0.01157679, + "auxiliary_loss_mlp": 0.01135494, + "balance_loss_clip": 1.00209928, + "balance_loss_mlp": 1.00093031, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.9217533074518154, + "language_loss": 0.70414066, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72707242, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.520613431930542 + }, + { + "auxiliary_loss_clip": 0.01157862, + "auxiliary_loss_mlp": 0.00748054, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.0008533, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6050916210172608, + "language_loss": 0.80634373, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82540286, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.559452533721924 + }, + { + "auxiliary_loss_clip": 0.01156556, + "auxiliary_loss_mlp": 0.01134987, + "balance_loss_clip": 1.00210917, + "balance_loss_mlp": 1.00061476, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.1617262480530397, + "language_loss": 0.581267, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60418242, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.5825045108795166 + }, + { + "auxiliary_loss_clip": 0.01113196, + "auxiliary_loss_mlp": 0.01134504, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00079918, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.8975251135974438, + "language_loss": 0.81737745, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83985448, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.651111364364624 + }, + { + "auxiliary_loss_clip": 0.01157796, + "auxiliary_loss_mlp": 0.01135668, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.00100923, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 1.8407254183208586, + "language_loss": 0.7430113, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76594591, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 3.978165864944458 + }, + { + "auxiliary_loss_clip": 0.01157201, + "auxiliary_loss_mlp": 0.01136781, + "balance_loss_clip": 1.00246346, + "balance_loss_mlp": 1.00078726, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.6271616423347632, + "language_loss": 0.67494404, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69788384, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.734398365020752 + }, + { + "auxiliary_loss_clip": 0.01139781, + "auxiliary_loss_mlp": 0.01134683, + "balance_loss_clip": 1.00207961, + "balance_loss_mlp": 1.00078726, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.8657138595141123, + "language_loss": 0.79921079, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82195544, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 2.5835001468658447 + }, + { + "auxiliary_loss_clip": 0.01139591, + "auxiliary_loss_mlp": 0.01135109, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00073624, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.7724283951404154, + "language_loss": 0.69805139, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72079837, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 4.078983545303345 + }, + { + "auxiliary_loss_clip": 0.01124467, + "auxiliary_loss_mlp": 0.01135181, + "balance_loss_clip": 1.00169706, + "balance_loss_mlp": 1.00061774, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.7790106686251523, + "language_loss": 0.79577929, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81837583, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 2.6983392238616943 + }, + { + "auxiliary_loss_clip": 0.01140439, + "auxiliary_loss_mlp": 0.01136074, + "balance_loss_clip": 1.00234854, + "balance_loss_mlp": 1.00084269, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.248064778025552, + "language_loss": 0.84807599, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87084109, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.5331196784973145 + }, + { + "auxiliary_loss_clip": 0.01109194, + "auxiliary_loss_mlp": 0.01135018, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00093138, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5288078830935843, + "language_loss": 0.80322742, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82566959, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.629988670349121 + }, + { + "auxiliary_loss_clip": 0.01109657, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.00084698, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.6804943583859397, + "language_loss": 0.70821857, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.73066545, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 2.960334539413452 + }, + { + "auxiliary_loss_clip": 0.01124533, + "auxiliary_loss_mlp": 0.01136024, + "balance_loss_clip": 1.00208533, + "balance_loss_mlp": 1.00079274, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 2.091193547329533, + "language_loss": 0.7234627, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74606824, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.655606269836426 + }, + { + "auxiliary_loss_clip": 0.01158005, + "auxiliary_loss_mlp": 0.01134508, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00070798, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.15210867460012, + "language_loss": 0.791704, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81462908, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.5326759815216064 + }, + { + "auxiliary_loss_clip": 0.01173319, + "auxiliary_loss_mlp": 0.01135155, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.00078201, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.4614529396479146, + "language_loss": 0.75982761, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78291231, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.5769729614257812 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01134916, + "balance_loss_clip": 1.00228214, + "balance_loss_mlp": 1.00073361, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.2625102483959627, + "language_loss": 0.79799175, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82107264, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.5246639251708984 + }, + { + "auxiliary_loss_clip": 0.01157035, + "auxiliary_loss_mlp": 0.01122203, + "balance_loss_clip": 1.00325906, + "balance_loss_mlp": 1.0001328, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8309967015150137, + "language_loss": 0.67865551, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70144784, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.128535747528076 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01136176, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00084925, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 1.9032521628135826, + "language_loss": 0.82588136, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84865093, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 2.5776875019073486 + }, + { + "auxiliary_loss_clip": 0.01140767, + "auxiliary_loss_mlp": 0.00748145, + "balance_loss_clip": 1.00211382, + "balance_loss_mlp": 1.00088215, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 1.650059560030859, + "language_loss": 0.81441206, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83330119, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.6223318576812744 + }, + { + "auxiliary_loss_clip": 0.01173323, + "auxiliary_loss_mlp": 0.01135105, + "balance_loss_clip": 1.00236893, + "balance_loss_mlp": 1.00082755, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.7098096356056003, + "language_loss": 0.74500489, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76808923, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.592785596847534 + }, + { + "auxiliary_loss_clip": 0.01127313, + "auxiliary_loss_mlp": 0.01135465, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00080657, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9004155299185137, + "language_loss": 0.73956227, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.7621901, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.663841485977173 + }, + { + "auxiliary_loss_clip": 0.01156471, + "auxiliary_loss_mlp": 0.01135461, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00080216, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 1.772573712865191, + "language_loss": 0.64755785, + "learning_rate": 3.205269272758513e-06, + "loss": 0.67047715, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.553138494491577 + }, + { + "auxiliary_loss_clip": 0.01106588, + "auxiliary_loss_mlp": 0.01135518, + "balance_loss_clip": 1.00169253, + "balance_loss_mlp": 1.00076365, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.117915587531034, + "language_loss": 0.91066039, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93308145, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.607712507247925 + }, + { + "auxiliary_loss_clip": 0.01156461, + "auxiliary_loss_mlp": 0.01135942, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00099754, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 2.562773713681604, + "language_loss": 0.75486386, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77778786, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.5778415203094482 + }, + { + "auxiliary_loss_clip": 0.01173172, + "auxiliary_loss_mlp": 0.01135224, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00075626, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.814922022574976, + "language_loss": 0.61498272, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63806665, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.603966236114502 + }, + { + "auxiliary_loss_clip": 0.01156647, + "auxiliary_loss_mlp": 0.01135467, + "balance_loss_clip": 1.0021739, + "balance_loss_mlp": 1.00080836, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.0858839284447663, + "language_loss": 0.82039928, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84332037, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.5234997272491455 + }, + { + "auxiliary_loss_clip": 0.01140528, + "auxiliary_loss_mlp": 0.01135355, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00088668, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.932847903491249, + "language_loss": 0.84624559, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86900443, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.5539541244506836 + }, + { + "auxiliary_loss_clip": 0.01128786, + "auxiliary_loss_mlp": 0.01136122, + "balance_loss_clip": 1.00220764, + "balance_loss_mlp": 1.00070059, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.7793292319658436, + "language_loss": 0.85777813, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.88042724, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.5998010635375977 + }, + { + "auxiliary_loss_clip": 0.01139673, + "auxiliary_loss_mlp": 0.01135579, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00063396, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.099862689852904, + "language_loss": 0.68874133, + "learning_rate": 3.203092573767835e-06, + "loss": 0.71149385, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.5537495613098145 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_clip": 1.00228751, + "balance_loss_mlp": 1.00067306, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7114789200579439, + "language_loss": 0.791278, + "learning_rate": 3.202781434189246e-06, + "loss": 0.8143633, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.5343925952911377 + }, + { + "auxiliary_loss_clip": 0.01158011, + "auxiliary_loss_mlp": 0.01134991, + "balance_loss_clip": 1.0023365, + "balance_loss_mlp": 1.00080872, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.8311186781818587, + "language_loss": 0.74362785, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76655781, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.5780179500579834 + }, + { + "auxiliary_loss_clip": 0.01145934, + "auxiliary_loss_mlp": 0.01135834, + "balance_loss_clip": 1.00230384, + "balance_loss_mlp": 1.00060344, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.9311176506876049, + "language_loss": 0.73644865, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75926626, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.5891778469085693 + }, + { + "auxiliary_loss_clip": 0.0115663, + "auxiliary_loss_mlp": 0.01135663, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00071859, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 3.514163788078146, + "language_loss": 0.77844816, + "learning_rate": 3.201847741843128e-06, + "loss": 0.8013711, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.5190043449401855 + }, + { + "auxiliary_loss_clip": 0.01139393, + "auxiliary_loss_mlp": 0.01135003, + "balance_loss_clip": 1.00200772, + "balance_loss_mlp": 1.00072575, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.0698480184419266, + "language_loss": 0.78272218, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80546618, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.611815929412842 + }, + { + "auxiliary_loss_clip": 0.01123053, + "auxiliary_loss_mlp": 0.01134189, + "balance_loss_clip": 1.00193274, + "balance_loss_mlp": 1.00086546, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.8649865122059166, + "language_loss": 0.71610343, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73867583, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.6142327785491943 + }, + { + "auxiliary_loss_clip": 0.01156791, + "auxiliary_loss_mlp": 0.01135577, + "balance_loss_clip": 1.00217068, + "balance_loss_mlp": 1.00091839, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 2.0656905400438097, + "language_loss": 0.76539916, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.78832287, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 4.0953590869903564 + }, + { + "auxiliary_loss_clip": 0.01141525, + "auxiliary_loss_mlp": 0.01135419, + "balance_loss_clip": 1.0021522, + "balance_loss_mlp": 1.00076056, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 1.9423244337999364, + "language_loss": 0.72646016, + "learning_rate": 3.200602180731467e-06, + "loss": 0.74922961, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.5859553813934326 + }, + { + "auxiliary_loss_clip": 0.01141588, + "auxiliary_loss_mlp": 0.00748086, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00093913, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.772805772958685, + "language_loss": 0.6691041, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68800092, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.6220836639404297 + }, + { + "auxiliary_loss_clip": 0.0115745, + "auxiliary_loss_mlp": 0.01135036, + "balance_loss_clip": 1.00210059, + "balance_loss_mlp": 1.00056767, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 2.1717499129138984, + "language_loss": 0.72282159, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74574643, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.5842225551605225 + }, + { + "auxiliary_loss_clip": 0.01155454, + "auxiliary_loss_mlp": 0.01121408, + "balance_loss_clip": 1.0033617, + "balance_loss_mlp": 1.00010037, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7437616644502201, + "language_loss": 0.50654459, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52931321, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.172863006591797 + }, + { + "auxiliary_loss_clip": 0.01156618, + "auxiliary_loss_mlp": 0.01135855, + "balance_loss_clip": 1.00224674, + "balance_loss_mlp": 1.00062394, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.6000304415203253, + "language_loss": 0.85446405, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87738878, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.5750114917755127 + }, + { + "auxiliary_loss_clip": 0.01140942, + "auxiliary_loss_mlp": 0.01134269, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.0008496, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4935723600976578, + "language_loss": 0.81624103, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83899319, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 4.117549896240234 + }, + { + "auxiliary_loss_clip": 0.01142747, + "auxiliary_loss_mlp": 0.01135483, + "balance_loss_clip": 1.00224257, + "balance_loss_mlp": 1.00063324, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.6636682332930708, + "language_loss": 0.79745233, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82023466, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.553661346435547 + }, + { + "auxiliary_loss_clip": 0.01156616, + "auxiliary_loss_mlp": 0.0113558, + "balance_loss_clip": 1.00231731, + "balance_loss_mlp": 1.00082612, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.693429705973354, + "language_loss": 0.74824637, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77116829, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.549586296081543 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01135696, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00075078, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.04512244665143, + "language_loss": 0.78935498, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81197047, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.6025774478912354 + }, + { + "auxiliary_loss_clip": 0.01139373, + "auxiliary_loss_mlp": 0.01122186, + "balance_loss_clip": 1.00351703, + "balance_loss_mlp": 1.00011539, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7278163412561867, + "language_loss": 0.57735002, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59996563, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 4.623117208480835 + }, + { + "auxiliary_loss_clip": 0.01173265, + "auxiliary_loss_mlp": 0.01135026, + "balance_loss_clip": 1.00220418, + "balance_loss_mlp": 1.0005579, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.730383443557868, + "language_loss": 0.72626662, + "learning_rate": 3.197485092719815e-06, + "loss": 0.74934953, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.49837589263916 + }, + { + "auxiliary_loss_clip": 0.01124032, + "auxiliary_loss_mlp": 0.01135139, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00095749, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 1.896591238089769, + "language_loss": 0.79790395, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82049567, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.6227614879608154 + }, + { + "auxiliary_loss_clip": 0.01173327, + "auxiliary_loss_mlp": 0.01135816, + "balance_loss_clip": 1.00227189, + "balance_loss_mlp": 1.00077617, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 2.5456541837127413, + "language_loss": 0.79572779, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81881917, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.4784159660339355 + }, + { + "auxiliary_loss_clip": 0.01173377, + "auxiliary_loss_mlp": 0.01135517, + "balance_loss_clip": 1.002352, + "balance_loss_mlp": 1.00076294, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.9048820517587792, + "language_loss": 0.72878623, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75187522, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.4946320056915283 + }, + { + "auxiliary_loss_clip": 0.01141325, + "auxiliary_loss_mlp": 0.01136158, + "balance_loss_clip": 1.00216794, + "balance_loss_mlp": 1.00073647, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.228188042340659, + "language_loss": 0.69383663, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.7166115, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 2.791036367416382 + }, + { + "auxiliary_loss_clip": 0.01156518, + "auxiliary_loss_mlp": 0.00747972, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00086522, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.7908112188747918, + "language_loss": 0.67769343, + "learning_rate": 3.195924845146795e-06, + "loss": 0.6967383, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.5713276863098145 + }, + { + "auxiliary_loss_clip": 0.01124153, + "auxiliary_loss_mlp": 0.01134623, + "balance_loss_clip": 1.00210142, + "balance_loss_mlp": 1.00091767, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.50490930778918, + "language_loss": 0.80853879, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83112651, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.6941542625427246 + }, + { + "auxiliary_loss_clip": 0.01157564, + "auxiliary_loss_mlp": 0.0113514, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.0008626, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 4.8598616271696695, + "language_loss": 0.73082805, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.75375509, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.5511977672576904 + }, + { + "auxiliary_loss_clip": 0.01141451, + "auxiliary_loss_mlp": 0.01134745, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.00065839, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.3826995663782342, + "language_loss": 0.77896094, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80172288, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.6440272331237793 + }, + { + "auxiliary_loss_clip": 0.01141, + "auxiliary_loss_mlp": 0.01135466, + "balance_loss_clip": 1.00214565, + "balance_loss_mlp": 1.00080717, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.6072660391782576, + "language_loss": 0.78999144, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81275612, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.575364112854004 + }, + { + "auxiliary_loss_clip": 0.01155693, + "auxiliary_loss_mlp": 0.01121416, + "balance_loss_clip": 1.00375903, + "balance_loss_mlp": 1.0001086, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8793449174053072, + "language_loss": 0.62851429, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.65128529, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 2.907789707183838 + }, + { + "auxiliary_loss_clip": 0.01173329, + "auxiliary_loss_mlp": 0.01136043, + "balance_loss_clip": 1.00223994, + "balance_loss_mlp": 1.00081217, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.4829548914045754, + "language_loss": 0.80851495, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83160865, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 2.5401382446289062 + }, + { + "auxiliary_loss_clip": 0.01122886, + "auxiliary_loss_mlp": 0.01135437, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.00106406, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.48001596397971, + "language_loss": 0.78497434, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80755758, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.7003989219665527 + }, + { + "auxiliary_loss_clip": 0.01140769, + "auxiliary_loss_mlp": 0.01134756, + "balance_loss_clip": 1.00218892, + "balance_loss_mlp": 1.00076497, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.5245519764392408, + "language_loss": 0.78043926, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80319452, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.6180315017700195 + }, + { + "auxiliary_loss_clip": 0.01142735, + "auxiliary_loss_mlp": 0.01135892, + "balance_loss_clip": 1.00225019, + "balance_loss_mlp": 1.00085175, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.14130716055872, + "language_loss": 0.67444164, + "learning_rate": 3.193113543486061e-06, + "loss": 0.6972279, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.6203408241271973 + }, + { + "auxiliary_loss_clip": 0.01155587, + "auxiliary_loss_mlp": 0.01121346, + "balance_loss_clip": 1.00366807, + "balance_loss_mlp": 1.00003839, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.735809533429642, + "language_loss": 0.52742064, + "learning_rate": 3.192800950261958e-06, + "loss": 0.55018997, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.156529188156128 + }, + { + "auxiliary_loss_clip": 0.01140041, + "auxiliary_loss_mlp": 0.01136036, + "balance_loss_clip": 1.00220513, + "balance_loss_mlp": 1.00090027, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.6319708366046055, + "language_loss": 0.70676249, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.7295233, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 2.577266216278076 + }, + { + "auxiliary_loss_clip": 0.01172552, + "auxiliary_loss_mlp": 0.01121301, + "balance_loss_clip": 1.00380778, + "balance_loss_mlp": 0.99999392, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8197443017657543, + "language_loss": 0.60499239, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.627931, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.1375739574432373 + }, + { + "auxiliary_loss_clip": 0.01173309, + "auxiliary_loss_mlp": 0.01135263, + "balance_loss_clip": 1.00223255, + "balance_loss_mlp": 1.00079513, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 3.829624199817799, + "language_loss": 0.72204757, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74513328, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.5065863132476807 + }, + { + "auxiliary_loss_clip": 0.01156551, + "auxiliary_loss_mlp": 0.0113547, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00081134, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 2.362398392672988, + "language_loss": 0.75529122, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77821136, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.553973436355591 + }, + { + "auxiliary_loss_clip": 0.01157125, + "auxiliary_loss_mlp": 0.01134235, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00062561, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 1.635234192950542, + "language_loss": 0.87969834, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.90261197, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.5503673553466797 + }, + { + "auxiliary_loss_clip": 0.01156359, + "auxiliary_loss_mlp": 0.01134746, + "balance_loss_clip": 1.0021193, + "balance_loss_mlp": 1.00065947, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.5424886570115497, + "language_loss": 0.6795491, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70246017, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.5481576919555664 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01135522, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00076795, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 5.259114224127761, + "language_loss": 0.8001225, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82287723, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.6165897846221924 + }, + { + "auxiliary_loss_clip": 0.01108889, + "auxiliary_loss_mlp": 0.01135004, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00063157, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 1.9689019621896826, + "language_loss": 0.79890537, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82134426, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.64304518699646 + }, + { + "auxiliary_loss_clip": 0.01162429, + "auxiliary_loss_mlp": 0.01133958, + "balance_loss_clip": 1.00242162, + "balance_loss_mlp": 1.00082541, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 2.23973834899478, + "language_loss": 0.75288635, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77585018, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.536564826965332 + }, + { + "auxiliary_loss_clip": 0.01156521, + "auxiliary_loss_mlp": 0.01134291, + "balance_loss_clip": 1.00230968, + "balance_loss_mlp": 1.00087166, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 2.077295714665781, + "language_loss": 0.74031699, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76322508, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.5942373275756836 + }, + { + "auxiliary_loss_clip": 0.01173243, + "auxiliary_loss_mlp": 0.01135576, + "balance_loss_clip": 1.00222659, + "balance_loss_mlp": 1.00072622, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 3.1113673591136206, + "language_loss": 0.7580986, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78118682, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.4915719032287598 + }, + { + "auxiliary_loss_clip": 0.01130739, + "auxiliary_loss_mlp": 0.01135468, + "balance_loss_clip": 1.00249088, + "balance_loss_mlp": 1.00090444, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.4777297787074097, + "language_loss": 0.69529003, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71795207, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 4.168200731277466 + }, + { + "auxiliary_loss_clip": 0.01139737, + "auxiliary_loss_mlp": 0.01135075, + "balance_loss_clip": 1.00209665, + "balance_loss_mlp": 1.00089335, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.6669835457035214, + "language_loss": 0.77670217, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79945028, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.6304969787597656 + }, + { + "auxiliary_loss_clip": 0.01124399, + "auxiliary_loss_mlp": 0.01134338, + "balance_loss_clip": 1.00200772, + "balance_loss_mlp": 1.00053728, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.743319558489655, + "language_loss": 0.79991853, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.82250589, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.7795026302337646 + }, + { + "auxiliary_loss_clip": 0.01140792, + "auxiliary_loss_mlp": 0.01135284, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00091124, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 1.6364508052460505, + "language_loss": 0.74032331, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76308405, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.6381688117980957 + }, + { + "auxiliary_loss_clip": 0.01142199, + "auxiliary_loss_mlp": 0.01135435, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.00087142, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 1.9994030323147687, + "language_loss": 0.78287184, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80564815, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.627230167388916 + }, + { + "auxiliary_loss_clip": 0.01141336, + "auxiliary_loss_mlp": 0.01135292, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00082421, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 2.0293639448279928, + "language_loss": 0.83642679, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.85919309, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.5636932849884033 + }, + { + "auxiliary_loss_clip": 0.0115673, + "auxiliary_loss_mlp": 0.0113578, + "balance_loss_clip": 1.00237393, + "balance_loss_mlp": 1.00083494, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4353897958183217, + "language_loss": 0.77145779, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79438293, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 3.939155340194702 + }, + { + "auxiliary_loss_clip": 0.01173122, + "auxiliary_loss_mlp": 0.01133997, + "balance_loss_clip": 1.00216043, + "balance_loss_mlp": 1.00057793, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.8873421458788995, + "language_loss": 0.79285067, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.8159219, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.5066938400268555 + }, + { + "auxiliary_loss_clip": 0.01156713, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.00218749, + "balance_loss_mlp": 1.00095308, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 4.323998135932892, + "language_loss": 0.7289629, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75189567, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 2.545630931854248 + }, + { + "auxiliary_loss_clip": 0.01142935, + "auxiliary_loss_mlp": 0.01134879, + "balance_loss_clip": 1.00229979, + "balance_loss_mlp": 1.0006969, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.9756636664145362, + "language_loss": 0.72367907, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74645716, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.626931667327881 + }, + { + "auxiliary_loss_clip": 0.01141329, + "auxiliary_loss_mlp": 0.01135078, + "balance_loss_clip": 1.00215745, + "balance_loss_mlp": 1.00080073, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 2.1431884131162846, + "language_loss": 0.64328969, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66605377, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 3.987896203994751 + }, + { + "auxiliary_loss_clip": 0.01141231, + "auxiliary_loss_mlp": 0.01135691, + "balance_loss_clip": 1.00217009, + "balance_loss_mlp": 1.00084114, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.665472685767414, + "language_loss": 0.79537523, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81814444, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 4.039121866226196 + }, + { + "auxiliary_loss_clip": 0.01139453, + "auxiliary_loss_mlp": 0.01134703, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.0008074, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.7689137816920224, + "language_loss": 0.77949154, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.8022331, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.01156809, + "auxiliary_loss_mlp": 0.0113688, + "balance_loss_clip": 1.0023253, + "balance_loss_mlp": 1.00079048, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 2.254716651388118, + "language_loss": 0.74005395, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76299083, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.5034446716308594 + }, + { + "auxiliary_loss_clip": 0.01157636, + "auxiliary_loss_mlp": 0.01135512, + "balance_loss_clip": 1.00225723, + "balance_loss_mlp": 1.00075781, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 8.456372246364232, + "language_loss": 0.82877207, + "learning_rate": 3.184657685014856e-06, + "loss": 0.85170358, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.5488836765289307 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.0113503, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.00075293, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4534441739861987, + "language_loss": 0.78273475, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80548048, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.6199917793273926 + }, + { + "auxiliary_loss_clip": 0.01126112, + "auxiliary_loss_mlp": 0.01134945, + "balance_loss_clip": 1.00220168, + "balance_loss_mlp": 1.00085866, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 1.6395884592899082, + "language_loss": 0.84531176, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86792231, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.6338958740234375 + }, + { + "auxiliary_loss_clip": 0.01126288, + "auxiliary_loss_mlp": 0.01136194, + "balance_loss_clip": 1.00223219, + "balance_loss_mlp": 1.00096321, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 4.206486519838829, + "language_loss": 0.78557122, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80819607, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.6049764156341553 + }, + { + "auxiliary_loss_clip": 0.01156676, + "auxiliary_loss_mlp": 0.01135246, + "balance_loss_clip": 1.00221562, + "balance_loss_mlp": 1.00068235, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.5826404063054067, + "language_loss": 0.8584789, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88139808, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.5507028102874756 + }, + { + "auxiliary_loss_clip": 0.01142237, + "auxiliary_loss_mlp": 0.01135178, + "balance_loss_clip": 1.00224686, + "balance_loss_mlp": 1.00071013, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.6804381067355434, + "language_loss": 0.79864442, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82141852, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.593585729598999 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01135439, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.00097096, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 1.6271317032653991, + "language_loss": 0.67469674, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69730943, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.599480152130127 + }, + { + "auxiliary_loss_clip": 0.01157946, + "auxiliary_loss_mlp": 0.01135257, + "balance_loss_clip": 1.00222182, + "balance_loss_mlp": 1.00078857, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4495042943101584, + "language_loss": 0.69359702, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71652907, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.6098132133483887 + }, + { + "auxiliary_loss_clip": 0.01138762, + "auxiliary_loss_mlp": 0.01121435, + "balance_loss_clip": 1.00317192, + "balance_loss_mlp": 1.00012767, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7265095850859092, + "language_loss": 0.53041804, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55302, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.30322003364563 + }, + { + "auxiliary_loss_clip": 0.01173143, + "auxiliary_loss_mlp": 0.01134744, + "balance_loss_clip": 1.00222433, + "balance_loss_mlp": 1.00075293, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 2.0142974773386197, + "language_loss": 0.8422581, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86533695, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.4687821865081787 + }, + { + "auxiliary_loss_clip": 0.01157908, + "auxiliary_loss_mlp": 0.01134762, + "balance_loss_clip": 1.00220799, + "balance_loss_mlp": 1.00086606, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.561950448013055, + "language_loss": 0.63434076, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65726739, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.637096405029297 + }, + { + "auxiliary_loss_clip": 0.01147502, + "auxiliary_loss_mlp": 0.0113534, + "balance_loss_clip": 1.00263286, + "balance_loss_mlp": 1.00068104, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.8990591092601246, + "language_loss": 0.70418555, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72701395, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.58804988861084 + }, + { + "auxiliary_loss_clip": 0.0117354, + "auxiliary_loss_mlp": 0.00748191, + "balance_loss_clip": 1.00232601, + "balance_loss_mlp": 1.0009197, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 2.603361276927104, + "language_loss": 0.8658402, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88505745, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.529240131378174 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01134384, + "balance_loss_clip": 1.00223029, + "balance_loss_mlp": 1.00058341, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7468624526086745, + "language_loss": 0.83500624, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85776466, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.6544315814971924 + }, + { + "auxiliary_loss_clip": 0.01162519, + "auxiliary_loss_mlp": 0.01134572, + "balance_loss_clip": 1.00250983, + "balance_loss_mlp": 1.00067616, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.7443769200420844, + "language_loss": 0.78516424, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80813515, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.5503060817718506 + }, + { + "auxiliary_loss_clip": 0.01142021, + "auxiliary_loss_mlp": 0.01134934, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.00065684, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 2.042731898367203, + "language_loss": 0.80591714, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82868671, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.5902554988861084 + }, + { + "auxiliary_loss_clip": 0.01156822, + "auxiliary_loss_mlp": 0.01135448, + "balance_loss_clip": 1.00220561, + "balance_loss_mlp": 1.0006938, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.7758010475076789, + "language_loss": 0.74688631, + "learning_rate": 3.179631337655037e-06, + "loss": 0.76980901, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.6257779598236084 + }, + { + "auxiliary_loss_clip": 0.01127379, + "auxiliary_loss_mlp": 0.01134301, + "balance_loss_clip": 1.00228238, + "balance_loss_mlp": 1.00069106, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.5394207075787694, + "language_loss": 0.81129551, + "learning_rate": 3.179316810218701e-06, + "loss": 0.83391225, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.678251266479492 + }, + { + "auxiliary_loss_clip": 0.01124508, + "auxiliary_loss_mlp": 0.01134291, + "balance_loss_clip": 1.00200355, + "balance_loss_mlp": 1.0006814, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5815169696792848, + "language_loss": 0.77902102, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80160904, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.653364419937134 + }, + { + "auxiliary_loss_clip": 0.01108576, + "auxiliary_loss_mlp": 0.01135651, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.00080192, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.9115256048307, + "language_loss": 0.74303138, + "learning_rate": 3.178687621198524e-06, + "loss": 0.7654736, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.73938250541687 + }, + { + "auxiliary_loss_clip": 0.01141167, + "auxiliary_loss_mlp": 0.01134031, + "balance_loss_clip": 1.00220382, + "balance_loss_mlp": 1.00080252, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.5930752487092483, + "language_loss": 0.71413189, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73688388, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.5581812858581543 + }, + { + "auxiliary_loss_clip": 0.01108437, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00108516, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 2.0368770703347363, + "language_loss": 0.80081242, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82325995, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.7251698970794678 + }, + { + "auxiliary_loss_clip": 0.01155948, + "auxiliary_loss_mlp": 0.01121591, + "balance_loss_clip": 1.00320911, + "balance_loss_mlp": 1.00028408, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8289886259714038, + "language_loss": 0.57828861, + "learning_rate": 3.177743502478447e-06, + "loss": 0.60106403, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.058825969696045 + }, + { + "auxiliary_loss_clip": 0.01125956, + "auxiliary_loss_mlp": 0.01135004, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00063181, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.466420486037966, + "language_loss": 0.73138303, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75399262, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.6963844299316406 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01135158, + "balance_loss_clip": 1.00215316, + "balance_loss_mlp": 1.00088072, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 2.133564493211578, + "language_loss": 0.70549369, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72825384, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 4.1745569705963135 + }, + { + "auxiliary_loss_clip": 0.01109661, + "auxiliary_loss_mlp": 0.01134261, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00065112, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 2.0103933505456952, + "language_loss": 0.77688414, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79932332, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.681292772293091 + }, + { + "auxiliary_loss_clip": 0.01156484, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.00074434, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.5520696441434534, + "language_loss": 0.68376863, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70668364, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.653113603591919 + }, + { + "auxiliary_loss_clip": 0.0112589, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00088823, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 8.512230469535526, + "language_loss": 0.79064453, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81325603, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.620792865753174 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01133985, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.0007565, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.6427713145603184, + "language_loss": 0.73928815, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76219207, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.54712176322937 + }, + { + "auxiliary_loss_clip": 0.01146861, + "auxiliary_loss_mlp": 0.01134585, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 1.00068879, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 4.641893668106974, + "language_loss": 0.63013721, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.6529516, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 4.09149956703186 + }, + { + "auxiliary_loss_clip": 0.01173198, + "auxiliary_loss_mlp": 0.01135279, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00071537, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 1.9221147017589977, + "language_loss": 0.81801212, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84109688, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.5303843021392822 + }, + { + "auxiliary_loss_clip": 0.01131666, + "auxiliary_loss_mlp": 0.01135358, + "balance_loss_clip": 1.00226092, + "balance_loss_mlp": 1.00069928, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 2.1983166503443643, + "language_loss": 0.7621963, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78486651, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.632944345474243 + }, + { + "auxiliary_loss_clip": 0.01122752, + "auxiliary_loss_mlp": 0.01134181, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00066614, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.5666310652464324, + "language_loss": 0.78948021, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81204951, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.669341564178467 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01135787, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.0007472, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 2.5186841198818293, + "language_loss": 0.74776745, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77053452, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 3.999746084213257 + }, + { + "auxiliary_loss_clip": 0.01093029, + "auxiliary_loss_mlp": 0.01135001, + "balance_loss_clip": 1.00184441, + "balance_loss_mlp": 1.00081897, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.5194132750923628, + "language_loss": 0.82762086, + "learning_rate": 3.173963011408748e-06, + "loss": 0.8499012, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.7268764972686768 + }, + { + "auxiliary_loss_clip": 0.0110788, + "auxiliary_loss_mlp": 0.01134673, + "balance_loss_clip": 1.00169146, + "balance_loss_mlp": 1.00058663, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.073597032804195, + "language_loss": 0.79771918, + "learning_rate": 3.173647680842262e-06, + "loss": 0.82014465, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 4.206842660903931 + }, + { + "auxiliary_loss_clip": 0.01141004, + "auxiliary_loss_mlp": 0.01134894, + "balance_loss_clip": 1.00182712, + "balance_loss_mlp": 1.00071216, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.757547560922129, + "language_loss": 0.83218122, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85494018, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.6358468532562256 + }, + { + "auxiliary_loss_clip": 0.01129935, + "auxiliary_loss_mlp": 0.01134645, + "balance_loss_clip": 1.00203788, + "balance_loss_mlp": 1.00065351, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.590298087430435, + "language_loss": 0.82088029, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.84352607, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.6350417137145996 + }, + { + "auxiliary_loss_clip": 0.01158077, + "auxiliary_loss_mlp": 0.01134749, + "balance_loss_clip": 1.00219345, + "balance_loss_mlp": 1.00085282, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.950461934296444, + "language_loss": 0.7969889, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.81991714, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.529090166091919 + }, + { + "auxiliary_loss_clip": 0.01141174, + "auxiliary_loss_mlp": 0.01135697, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00103831, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 2.1584554411909616, + "language_loss": 0.85455692, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87732565, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.575235605239868 + }, + { + "auxiliary_loss_clip": 0.01143221, + "auxiliary_loss_mlp": 0.0113514, + "balance_loss_clip": 1.00236726, + "balance_loss_mlp": 1.00076747, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 3.039579180909904, + "language_loss": 0.80619645, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82898009, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.599435567855835 + }, + { + "auxiliary_loss_clip": 0.01156295, + "auxiliary_loss_mlp": 0.01134827, + "balance_loss_clip": 1.00210023, + "balance_loss_mlp": 1.00083625, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.5719736421607653, + "language_loss": 0.79912883, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82204008, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.596616506576538 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.01135315, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.00103807, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.7302382080449155, + "language_loss": 0.75803292, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.78061175, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.622222661972046 + }, + { + "auxiliary_loss_clip": 0.01108785, + "auxiliary_loss_mlp": 0.01134636, + "balance_loss_clip": 1.00194371, + "balance_loss_mlp": 1.00074053, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 2.040094525471349, + "language_loss": 0.82007754, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.84251177, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.667518377304077 + }, + { + "auxiliary_loss_clip": 0.0107447, + "auxiliary_loss_mlp": 0.011343, + "balance_loss_clip": 1.00179648, + "balance_loss_mlp": 1.00059533, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.520210439232552, + "language_loss": 0.73001611, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75210381, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.7976996898651123 + }, + { + "auxiliary_loss_clip": 0.01124186, + "auxiliary_loss_mlp": 0.01134888, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00080192, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5584347739316247, + "language_loss": 0.83425784, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85684848, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.6717870235443115 + }, + { + "auxiliary_loss_clip": 0.01173312, + "auxiliary_loss_mlp": 0.01135549, + "balance_loss_clip": 1.00231099, + "balance_loss_mlp": 1.00089025, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 1.861097937452759, + "language_loss": 0.7135936, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73668218, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 2.4960498809814453 + }, + { + "auxiliary_loss_clip": 0.01109211, + "auxiliary_loss_mlp": 0.01136956, + "balance_loss_clip": 1.00214231, + "balance_loss_mlp": 1.00086713, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.9638632629669672, + "language_loss": 0.68531001, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70777166, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.679755687713623 + }, + { + "auxiliary_loss_clip": 0.01146196, + "auxiliary_loss_mlp": 0.01121281, + "balance_loss_clip": 1.00360322, + "balance_loss_mlp": 0.99997371, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7089380666031719, + "language_loss": 0.58252048, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60519528, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.287386894226074 + }, + { + "auxiliary_loss_clip": 0.01090944, + "auxiliary_loss_mlp": 0.01134625, + "balance_loss_clip": 1.00180614, + "balance_loss_mlp": 1.00072908, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 1.5803883468328879, + "language_loss": 0.83435452, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85661018, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.6960508823394775 + }, + { + "auxiliary_loss_clip": 0.01157515, + "auxiliary_loss_mlp": 0.01134464, + "balance_loss_clip": 1.00213349, + "balance_loss_mlp": 1.00075889, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 2.0547230918501627, + "language_loss": 0.79605997, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81897974, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.578779458999634 + }, + { + "auxiliary_loss_clip": 0.01154928, + "auxiliary_loss_mlp": 0.01121325, + "balance_loss_clip": 1.00266957, + "balance_loss_mlp": 1.00001764, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6749150014512946, + "language_loss": 0.56927079, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59203327, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 2.994431734085083 + }, + { + "auxiliary_loss_clip": 0.01091609, + "auxiliary_loss_mlp": 0.01135603, + "balance_loss_clip": 1.0019449, + "balance_loss_mlp": 1.00084925, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.909310576216627, + "language_loss": 0.71440232, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73667449, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.764688730239868 + }, + { + "auxiliary_loss_clip": 0.011578, + "auxiliary_loss_mlp": 0.01134784, + "balance_loss_clip": 1.00220609, + "balance_loss_mlp": 1.00088799, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.840145366592545, + "language_loss": 0.74388903, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76681489, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.6079366207122803 + }, + { + "auxiliary_loss_clip": 0.01156553, + "auxiliary_loss_mlp": 0.01134775, + "balance_loss_clip": 1.00205386, + "balance_loss_mlp": 1.00068808, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.073631662704346, + "language_loss": 0.76707971, + "learning_rate": 3.167647957801365e-06, + "loss": 0.78999299, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.550015926361084 + }, + { + "auxiliary_loss_clip": 0.01140569, + "auxiliary_loss_mlp": 0.01134791, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00079989, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.0046176234822912, + "language_loss": 0.77030134, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79305494, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.5661609172821045 + }, + { + "auxiliary_loss_clip": 0.0114068, + "auxiliary_loss_mlp": 0.01135091, + "balance_loss_clip": 1.00219595, + "balance_loss_mlp": 1.00081325, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5220821220888145, + "language_loss": 0.76704758, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78980529, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.6022517681121826 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_clip": 1.00210571, + "balance_loss_mlp": 1.00074196, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.9182630916177035, + "language_loss": 0.71699989, + "learning_rate": 3.166699169850055e-06, + "loss": 0.73975933, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.6091511249542236 + }, + { + "auxiliary_loss_clip": 0.01173012, + "auxiliary_loss_mlp": 0.01134178, + "balance_loss_clip": 1.00213063, + "balance_loss_mlp": 1.00075865, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 2.435257697610036, + "language_loss": 0.74509501, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.7681669, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.5414867401123047 + }, + { + "auxiliary_loss_clip": 0.01142941, + "auxiliary_loss_mlp": 0.01134532, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.00092196, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.6524845372633588, + "language_loss": 0.78642249, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80919719, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.707313299179077 + }, + { + "auxiliary_loss_clip": 0.0112467, + "auxiliary_loss_mlp": 0.01134863, + "balance_loss_clip": 1.0023191, + "balance_loss_mlp": 1.00058615, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 3.9119565118332766, + "language_loss": 0.83314967, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85574496, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.70646333694458 + }, + { + "auxiliary_loss_clip": 0.01173303, + "auxiliary_loss_mlp": 0.01134556, + "balance_loss_clip": 1.00233388, + "balance_loss_mlp": 1.00085092, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 26.274789251682712, + "language_loss": 0.83274436, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85582298, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 4.3096764087677 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.00748186, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.00108433, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.110224101559595, + "language_loss": 0.88626993, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90531445, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.558985471725464 + }, + { + "auxiliary_loss_clip": 0.01173257, + "auxiliary_loss_mlp": 0.01135021, + "balance_loss_clip": 1.0022881, + "balance_loss_mlp": 1.00093412, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 2.6150635866066043, + "language_loss": 0.72877449, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75185728, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.5223565101623535 + }, + { + "auxiliary_loss_clip": 0.01141461, + "auxiliary_loss_mlp": 0.01134686, + "balance_loss_clip": 1.00218606, + "balance_loss_mlp": 1.00078976, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.6922962376102846, + "language_loss": 0.81017935, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83294076, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.5670254230499268 + }, + { + "auxiliary_loss_clip": 0.01124533, + "auxiliary_loss_mlp": 0.01134786, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00060415, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.250525356813442, + "language_loss": 0.87835503, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.90094829, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.666354179382324 + }, + { + "auxiliary_loss_clip": 0.01173249, + "auxiliary_loss_mlp": 0.01135383, + "balance_loss_clip": 1.00222337, + "balance_loss_mlp": 1.00072443, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 2.157619912661421, + "language_loss": 0.76202488, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78511113, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 3.890150547027588 + }, + { + "auxiliary_loss_clip": 0.0112571, + "auxiliary_loss_mlp": 0.01133935, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.0007062, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 2.5380919858134328, + "language_loss": 0.66937345, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69196993, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.640920400619507 + }, + { + "auxiliary_loss_clip": 0.01125677, + "auxiliary_loss_mlp": 0.01134942, + "balance_loss_clip": 1.00216627, + "balance_loss_mlp": 1.00095093, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.4742404730818244, + "language_loss": 0.72369361, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.7462998, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.6553187370300293 + }, + { + "auxiliary_loss_clip": 0.01156918, + "auxiliary_loss_mlp": 0.01136, + "balance_loss_clip": 1.00230694, + "balance_loss_mlp": 1.00067389, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9718630916234996, + "language_loss": 0.82425225, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84718144, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.6004273891448975 + }, + { + "auxiliary_loss_clip": 0.01162959, + "auxiliary_loss_mlp": 0.01135179, + "balance_loss_clip": 1.00303912, + "balance_loss_mlp": 1.00080681, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.513544437170156, + "language_loss": 0.78495693, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80793834, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 2.605477809906006 + }, + { + "auxiliary_loss_clip": 0.01156668, + "auxiliary_loss_mlp": 0.01135527, + "balance_loss_clip": 1.0019908, + "balance_loss_mlp": 1.00096333, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.619930791210263, + "language_loss": 0.76850009, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79142201, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 3.990652322769165 + }, + { + "auxiliary_loss_clip": 0.01156428, + "auxiliary_loss_mlp": 0.01134178, + "balance_loss_clip": 1.00223899, + "balance_loss_mlp": 1.00066364, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.7681807535203884, + "language_loss": 0.722583, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74548906, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 4.002071857452393 + }, + { + "auxiliary_loss_clip": 0.01147298, + "auxiliary_loss_mlp": 0.01134973, + "balance_loss_clip": 1.0026052, + "balance_loss_mlp": 1.00088596, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.280837816496684, + "language_loss": 0.70196426, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72478688, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.653960704803467 + }, + { + "auxiliary_loss_clip": 0.01156115, + "auxiliary_loss_mlp": 0.01134389, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00087428, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.165762097097504, + "language_loss": 0.78449571, + "learning_rate": 3.161315193285283e-06, + "loss": 0.8074007, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.5662882328033447 + }, + { + "auxiliary_loss_clip": 0.01092711, + "auxiliary_loss_mlp": 0.01135319, + "balance_loss_clip": 1.00186002, + "balance_loss_mlp": 1.00085115, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 1.903487040121792, + "language_loss": 0.75299704, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77527738, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.683560371398926 + }, + { + "auxiliary_loss_clip": 0.01139556, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00081491, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.7896463904127815, + "language_loss": 0.71943957, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74218607, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.666316032409668 + }, + { + "auxiliary_loss_clip": 0.0117313, + "auxiliary_loss_mlp": 0.01135367, + "balance_loss_clip": 1.00213087, + "balance_loss_mlp": 1.00080347, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.9960059224882223, + "language_loss": 0.94234788, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96543288, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.5284745693206787 + }, + { + "auxiliary_loss_clip": 0.01157457, + "auxiliary_loss_mlp": 0.01135797, + "balance_loss_clip": 1.0022471, + "balance_loss_mlp": 1.00085235, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 1.9036069464679846, + "language_loss": 0.7768594, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79979193, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.547924518585205 + }, + { + "auxiliary_loss_clip": 0.01141009, + "auxiliary_loss_mlp": 0.01135065, + "balance_loss_clip": 1.00207531, + "balance_loss_mlp": 1.00059688, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.5950556958429143, + "language_loss": 0.71769094, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74045163, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.728396415710449 + }, + { + "auxiliary_loss_clip": 0.01124091, + "auxiliary_loss_mlp": 0.01134954, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00067616, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 1.7090560914711546, + "language_loss": 0.81213617, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83472663, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.613560438156128 + }, + { + "auxiliary_loss_clip": 0.01143152, + "auxiliary_loss_mlp": 0.0113562, + "balance_loss_clip": 1.0024159, + "balance_loss_mlp": 1.00067556, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 1.7217840212779847, + "language_loss": 0.72846663, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75125432, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.6222517490386963 + }, + { + "auxiliary_loss_clip": 0.01141148, + "auxiliary_loss_mlp": 0.01134138, + "balance_loss_clip": 1.00212705, + "balance_loss_mlp": 1.00071895, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.5799294403853905, + "language_loss": 0.77312577, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79587865, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.5499355792999268 + }, + { + "auxiliary_loss_clip": 0.01140945, + "auxiliary_loss_mlp": 0.01134883, + "balance_loss_clip": 1.00200427, + "balance_loss_mlp": 1.00070107, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 1.7403077819585182, + "language_loss": 0.62763858, + "learning_rate": 3.158459696652067e-06, + "loss": 0.65039688, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.6371943950653076 + }, + { + "auxiliary_loss_clip": 0.0115631, + "auxiliary_loss_mlp": 0.01134457, + "balance_loss_clip": 1.00208187, + "balance_loss_mlp": 1.00075173, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.5991690019793092, + "language_loss": 0.83018965, + "learning_rate": 3.158142199443371e-06, + "loss": 0.85309732, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.573486566543579 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.01134671, + "balance_loss_clip": 1.00232267, + "balance_loss_mlp": 1.00096583, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.7094998975484006, + "language_loss": 0.81950378, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.84225971, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.658426284790039 + }, + { + "auxiliary_loss_clip": 0.0115803, + "auxiliary_loss_mlp": 0.01134604, + "balance_loss_clip": 1.0024153, + "balance_loss_mlp": 1.00089848, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.455330692908247, + "language_loss": 0.83101463, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85394096, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.5648982524871826 + }, + { + "auxiliary_loss_clip": 0.01124635, + "auxiliary_loss_mlp": 0.01135418, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.0007596, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 2.0871585982252974, + "language_loss": 0.75780874, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.78040928, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.685818910598755 + }, + { + "auxiliary_loss_clip": 0.0112334, + "auxiliary_loss_mlp": 0.01134205, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00059485, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.003700984386381, + "language_loss": 0.66719282, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68976831, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.629688024520874 + }, + { + "auxiliary_loss_clip": 0.01141364, + "auxiliary_loss_mlp": 0.01134993, + "balance_loss_clip": 1.00232279, + "balance_loss_mlp": 1.00062072, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.5838309086455842, + "language_loss": 0.73157781, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75434136, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.6069467067718506 + }, + { + "auxiliary_loss_clip": 0.01124728, + "auxiliary_loss_mlp": 0.01134385, + "balance_loss_clip": 1.00228608, + "balance_loss_mlp": 1.00067961, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.0531942774428695, + "language_loss": 0.71129161, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73388273, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.6464955806732178 + }, + { + "auxiliary_loss_clip": 0.01157521, + "auxiliary_loss_mlp": 0.01135025, + "balance_loss_clip": 1.00217152, + "balance_loss_mlp": 1.00065207, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 2.5633723480793114, + "language_loss": 0.79337233, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81629777, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.6514995098114014 + }, + { + "auxiliary_loss_clip": 0.01142791, + "auxiliary_loss_mlp": 0.01135226, + "balance_loss_clip": 1.00221467, + "balance_loss_mlp": 1.00085354, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.7298655987285756, + "language_loss": 0.87841296, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.90119308, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.6120541095733643 + }, + { + "auxiliary_loss_clip": 0.01110448, + "auxiliary_loss_mlp": 0.01134316, + "balance_loss_clip": 1.00209105, + "balance_loss_mlp": 1.0007062, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 2.452756949701214, + "language_loss": 0.84911597, + "learning_rate": 3.155282749751332e-06, + "loss": 0.87156361, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.630652666091919 + }, + { + "auxiliary_loss_clip": 0.01142753, + "auxiliary_loss_mlp": 0.01133645, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.00098884, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.2714559266324175, + "language_loss": 0.87331235, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89607638, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.6283628940582275 + }, + { + "auxiliary_loss_clip": 0.01156203, + "auxiliary_loss_mlp": 0.01134178, + "balance_loss_clip": 1.00210989, + "balance_loss_mlp": 1.00075901, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 2.40427660682355, + "language_loss": 0.72635806, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74926186, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.579721689224243 + }, + { + "auxiliary_loss_clip": 0.01123019, + "auxiliary_loss_mlp": 0.01134731, + "balance_loss_clip": 1.00226486, + "balance_loss_mlp": 1.00064421, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.7517436897912828, + "language_loss": 0.83045137, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85302889, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.68643856048584 + }, + { + "auxiliary_loss_clip": 0.01173099, + "auxiliary_loss_mlp": 0.0113464, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00064874, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.784765817142758, + "language_loss": 0.87726605, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90034348, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.494793653488159 + }, + { + "auxiliary_loss_clip": 0.01140573, + "auxiliary_loss_mlp": 0.01133845, + "balance_loss_clip": 1.00213122, + "balance_loss_mlp": 1.00071239, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.3484447254966563, + "language_loss": 0.69881153, + "learning_rate": 3.153692632731479e-06, + "loss": 0.72155571, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.657043695449829 + }, + { + "auxiliary_loss_clip": 0.01156523, + "auxiliary_loss_mlp": 0.01134593, + "balance_loss_clip": 1.00207901, + "balance_loss_mlp": 1.00060129, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 2.139387671784071, + "language_loss": 0.77330244, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79621357, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 3.9894814491271973 + }, + { + "auxiliary_loss_clip": 0.01092891, + "auxiliary_loss_mlp": 0.01135015, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00092864, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.8268812789290396, + "language_loss": 0.83466959, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85694873, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.806647300720215 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01133922, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00069404, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.4749756032084658, + "language_loss": 0.70875275, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73116577, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.672586679458618 + }, + { + "auxiliary_loss_clip": 0.01113787, + "auxiliary_loss_mlp": 0.01134041, + "balance_loss_clip": 1.00229645, + "balance_loss_mlp": 1.00071716, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.7974010991956622, + "language_loss": 0.83205748, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85453582, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.691736936569214 + }, + { + "auxiliary_loss_clip": 0.01125192, + "auxiliary_loss_mlp": 0.01133782, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00055337, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.9024831198652188, + "language_loss": 0.80655646, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82914621, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.6564743518829346 + }, + { + "auxiliary_loss_clip": 0.01139816, + "auxiliary_loss_mlp": 0.01134159, + "balance_loss_clip": 1.00222254, + "balance_loss_mlp": 1.0006448, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.567884705807361, + "language_loss": 0.7676729, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79041266, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 3.9435925483703613 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01120983, + "balance_loss_clip": 1.00306988, + "balance_loss_mlp": 1.00043893, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9087881986609679, + "language_loss": 0.63916826, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.6616292, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.1425845623016357 + }, + { + "auxiliary_loss_clip": 0.01125889, + "auxiliary_loss_mlp": 0.01133651, + "balance_loss_clip": 1.00229275, + "balance_loss_mlp": 1.00051856, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.4328141320596586, + "language_loss": 0.74151719, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76411265, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 2.7424793243408203 + }, + { + "auxiliary_loss_clip": 0.01172642, + "auxiliary_loss_mlp": 0.01120936, + "balance_loss_clip": 1.00384188, + "balance_loss_mlp": 1.00039184, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7769993407036585, + "language_loss": 0.57874805, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60168386, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.185729503631592 + }, + { + "auxiliary_loss_clip": 0.01142229, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_clip": 1.00380063, + "balance_loss_mlp": 1.0003674, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.9046284437713028, + "language_loss": 0.63467813, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65730953, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 3.2453675270080566 + }, + { + "auxiliary_loss_clip": 0.01146113, + "auxiliary_loss_mlp": 0.01133771, + "balance_loss_clip": 1.00257564, + "balance_loss_mlp": 1.00082839, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.127189687186481, + "language_loss": 0.69576579, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71856463, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 4.055747985839844 + }, + { + "auxiliary_loss_clip": 0.01162547, + "auxiliary_loss_mlp": 0.01134118, + "balance_loss_clip": 1.0026226, + "balance_loss_mlp": 1.00060391, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6999752971411628, + "language_loss": 0.77368599, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79665267, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 3.9590015411376953 + }, + { + "auxiliary_loss_clip": 0.01158021, + "auxiliary_loss_mlp": 0.00747984, + "balance_loss_clip": 1.00224435, + "balance_loss_mlp": 1.00095725, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.7410073224366844, + "language_loss": 0.8016125, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82067251, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.609384059906006 + }, + { + "auxiliary_loss_clip": 0.01172992, + "auxiliary_loss_mlp": 0.01133853, + "balance_loss_clip": 1.00225973, + "balance_loss_mlp": 1.00052953, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.6270205035493943, + "language_loss": 0.7552743, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77834272, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 2.5914008617401123 + }, + { + "auxiliary_loss_clip": 0.01125478, + "auxiliary_loss_mlp": 0.00747916, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00089622, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 1.8013691290418272, + "language_loss": 0.625054, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64378798, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.01141185, + "auxiliary_loss_mlp": 0.01132434, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00054145, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 2.6352490851789905, + "language_loss": 0.74760377, + "learning_rate": 3.148596916016224e-06, + "loss": 0.77033997, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.6243605613708496 + }, + { + "auxiliary_loss_clip": 0.01139657, + "auxiliary_loss_mlp": 0.01133084, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.0007143, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 2.8548394364246956, + "language_loss": 0.77081192, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79353935, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.614880323410034 + }, + { + "auxiliary_loss_clip": 0.01124456, + "auxiliary_loss_mlp": 0.01133832, + "balance_loss_clip": 1.00201607, + "balance_loss_mlp": 1.00060356, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 3.1221221851564005, + "language_loss": 0.78780806, + "learning_rate": 3.147959166423428e-06, + "loss": 0.81039095, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 2.665494203567505 + }, + { + "auxiliary_loss_clip": 0.0110973, + "auxiliary_loss_mlp": 0.01133175, + "balance_loss_clip": 1.00209534, + "balance_loss_mlp": 1.00061393, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.59254346509484, + "language_loss": 0.74250662, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76493561, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.7031819820404053 + }, + { + "auxiliary_loss_clip": 0.01125383, + "auxiliary_loss_mlp": 0.01134309, + "balance_loss_clip": 1.00202668, + "balance_loss_mlp": 1.00089025, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 2.0140017720177923, + "language_loss": 0.78907609, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81167299, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.6251049041748047 + }, + { + "auxiliary_loss_clip": 0.01156228, + "auxiliary_loss_mlp": 0.01133103, + "balance_loss_clip": 1.00210142, + "balance_loss_mlp": 1.0007329, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.5664421784275784, + "language_loss": 0.71234447, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73523784, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.539559841156006 + }, + { + "auxiliary_loss_clip": 0.01122733, + "auxiliary_loss_mlp": 0.01133444, + "balance_loss_clip": 1.00193059, + "balance_loss_mlp": 1.00069225, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.6621327960795906, + "language_loss": 0.7876476, + "learning_rate": 3.146683144965881e-06, + "loss": 0.81020933, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.6073532104492188 + }, + { + "auxiliary_loss_clip": 0.01107176, + "auxiliary_loss_mlp": 0.01134612, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00081134, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.866008237889774, + "language_loss": 0.84097141, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86338931, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 2.6580097675323486 + }, + { + "auxiliary_loss_clip": 0.01156112, + "auxiliary_loss_mlp": 0.01133415, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00066352, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.5524877671337616, + "language_loss": 0.70665807, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72955334, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.5888564586639404 + }, + { + "auxiliary_loss_clip": 0.01109054, + "auxiliary_loss_mlp": 0.01133111, + "balance_loss_clip": 1.00188029, + "balance_loss_mlp": 1.00055027, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.474427162672732, + "language_loss": 0.84155548, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86397707, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.646359443664551 + }, + { + "auxiliary_loss_clip": 0.01139365, + "auxiliary_loss_mlp": 0.01132879, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00060415, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 3.24169446710762, + "language_loss": 0.85686791, + "learning_rate": 3.145406427790931e-06, + "loss": 0.87959039, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.620666265487671 + }, + { + "auxiliary_loss_clip": 0.01142438, + "auxiliary_loss_mlp": 0.01133978, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00065482, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.992953889843484, + "language_loss": 0.87785089, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.9006151, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.6287665367126465 + }, + { + "auxiliary_loss_clip": 0.01172817, + "auxiliary_loss_mlp": 0.01132872, + "balance_loss_clip": 1.00215733, + "balance_loss_mlp": 1.00059724, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.394995206614983, + "language_loss": 0.76012385, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78318077, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.4693872928619385 + }, + { + "auxiliary_loss_clip": 0.0117288, + "auxiliary_loss_mlp": 0.01132505, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00042093, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 2.2303629735103776, + "language_loss": 0.72192812, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74498194, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.542448043823242 + }, + { + "auxiliary_loss_clip": 0.01130797, + "auxiliary_loss_mlp": 0.01133583, + "balance_loss_clip": 1.00250804, + "balance_loss_mlp": 1.00064063, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.603238520585708, + "language_loss": 0.6360203, + "learning_rate": 3.144129015673189e-06, + "loss": 0.65866411, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.6873068809509277 + }, + { + "auxiliary_loss_clip": 0.01156019, + "auxiliary_loss_mlp": 0.0113341, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00065851, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.613876779201415, + "language_loss": 0.7482748, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.77116907, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.5976576805114746 + }, + { + "auxiliary_loss_clip": 0.01156232, + "auxiliary_loss_mlp": 0.01133673, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00082564, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.0831289422439174, + "language_loss": 0.74802685, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77092588, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.6058216094970703 + }, + { + "auxiliary_loss_clip": 0.01156142, + "auxiliary_loss_mlp": 0.00748105, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00093222, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9309547119468409, + "language_loss": 0.84700859, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86605108, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.5406100749969482 + }, + { + "auxiliary_loss_clip": 0.01157655, + "auxiliary_loss_mlp": 0.01133753, + "balance_loss_clip": 1.00216794, + "balance_loss_mlp": 1.00071549, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 2.346916613904597, + "language_loss": 0.86944366, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89235771, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.5558533668518066 + }, + { + "auxiliary_loss_clip": 0.01125834, + "auxiliary_loss_mlp": 0.01133893, + "balance_loss_clip": 1.0021708, + "balance_loss_mlp": 1.00056911, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.6749708688352642, + "language_loss": 0.77604252, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79863983, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.6521108150482178 + }, + { + "auxiliary_loss_clip": 0.01123477, + "auxiliary_loss_mlp": 0.00747908, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00084996, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 1.998467441012288, + "language_loss": 0.81124628, + "learning_rate": 3.142211596174343e-06, + "loss": 0.82996011, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.5943193435668945 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.01133039, + "balance_loss_clip": 1.00198686, + "balance_loss_mlp": 1.00066912, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.348539636674068, + "language_loss": 0.59229314, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61470759, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.661482810974121 + }, + { + "auxiliary_loss_clip": 0.01156184, + "auxiliary_loss_mlp": 0.01133573, + "balance_loss_clip": 1.00225794, + "balance_loss_mlp": 1.00063109, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.1559339660658057, + "language_loss": 0.88595569, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90885323, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 3.922499895095825 + }, + { + "auxiliary_loss_clip": 0.01142782, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.002262, + "balance_loss_mlp": 1.00071955, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.6762476051007842, + "language_loss": 0.78844118, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81121707, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.621206045150757 + }, + { + "auxiliary_loss_clip": 0.01141136, + "auxiliary_loss_mlp": 0.00748204, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.00102639, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.8530650445439025, + "language_loss": 0.73277003, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75166345, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.5897185802459717 + }, + { + "auxiliary_loss_clip": 0.0117299, + "auxiliary_loss_mlp": 0.0113325, + "balance_loss_clip": 1.00229585, + "balance_loss_mlp": 1.00068915, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.4537123569175912, + "language_loss": 0.66930687, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69236934, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.574145555496216 + }, + { + "auxiliary_loss_clip": 0.01124063, + "auxiliary_loss_mlp": 0.01132802, + "balance_loss_clip": 1.00197911, + "balance_loss_mlp": 1.00062323, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.475391490984211, + "language_loss": 0.65679622, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67936492, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.674933433532715 + }, + { + "auxiliary_loss_clip": 0.01162318, + "auxiliary_loss_mlp": 0.01133205, + "balance_loss_clip": 1.00240326, + "balance_loss_mlp": 1.00064397, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.711340709778617, + "language_loss": 0.77156496, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79452014, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.6127519607543945 + }, + { + "auxiliary_loss_clip": 0.01156062, + "auxiliary_loss_mlp": 0.01133922, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.0007894, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.3192820392444187, + "language_loss": 0.71029741, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.73319721, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 4.106542587280273 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01132708, + "balance_loss_clip": 1.00199258, + "balance_loss_mlp": 1.00052857, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.5806232293331375, + "language_loss": 0.78741288, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.81014657, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 2.6369829177856445 + }, + { + "auxiliary_loss_clip": 0.0115778, + "auxiliary_loss_mlp": 0.01133087, + "balance_loss_clip": 1.00225234, + "balance_loss_mlp": 1.00052583, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 3.2858797112655904, + "language_loss": 0.74853975, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77144837, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.617835521697998 + }, + { + "auxiliary_loss_clip": 0.01110693, + "auxiliary_loss_mlp": 0.0113234, + "balance_loss_clip": 1.00202286, + "balance_loss_mlp": 1.00063729, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 1.951841997375803, + "language_loss": 0.77274662, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79517692, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 2.6372158527374268 + }, + { + "auxiliary_loss_clip": 0.01162255, + "auxiliary_loss_mlp": 0.0113334, + "balance_loss_clip": 1.00246906, + "balance_loss_mlp": 1.00058818, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.7303631901508387, + "language_loss": 0.74073935, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76369536, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.5913093090057373 + }, + { + "auxiliary_loss_clip": 0.01172854, + "auxiliary_loss_mlp": 0.01133662, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.00081503, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.48609941003892, + "language_loss": 0.78424507, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80731022, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 5.3494837284088135 + }, + { + "auxiliary_loss_clip": 0.0112423, + "auxiliary_loss_mlp": 0.0113394, + "balance_loss_clip": 1.00192487, + "balance_loss_mlp": 1.0007118, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 1.8699766927269115, + "language_loss": 0.78866512, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81124675, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.6338393688201904 + }, + { + "auxiliary_loss_clip": 0.01156515, + "auxiliary_loss_mlp": 0.01133227, + "balance_loss_clip": 1.00227261, + "balance_loss_mlp": 1.0006659, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 1.654022614217806, + "language_loss": 0.72757709, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75047457, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.560460329055786 + }, + { + "auxiliary_loss_clip": 0.01139427, + "auxiliary_loss_mlp": 0.01133408, + "balance_loss_clip": 1.0020262, + "balance_loss_mlp": 1.00075197, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8541915025731996, + "language_loss": 0.84076387, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86349225, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.662742853164673 + }, + { + "auxiliary_loss_clip": 0.01172798, + "auxiliary_loss_mlp": 0.01133573, + "balance_loss_clip": 1.00217116, + "balance_loss_mlp": 1.00082183, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7290028285223347, + "language_loss": 0.76599514, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78905886, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.536539077758789 + }, + { + "auxiliary_loss_clip": 0.01157819, + "auxiliary_loss_mlp": 0.01133399, + "balance_loss_clip": 1.00219417, + "balance_loss_mlp": 1.00083828, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 2.425293055605875, + "language_loss": 0.63262433, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65553653, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.679569721221924 + }, + { + "auxiliary_loss_clip": 0.01172756, + "auxiliary_loss_mlp": 0.00748021, + "balance_loss_clip": 1.00217533, + "balance_loss_mlp": 1.00091982, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.4420790739939082, + "language_loss": 0.78316891, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80237663, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.5662436485290527 + }, + { + "auxiliary_loss_clip": 0.01124864, + "auxiliary_loss_mlp": 0.01133383, + "balance_loss_clip": 1.00201344, + "balance_loss_mlp": 1.00063169, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.164960852966008, + "language_loss": 0.69930041, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72188288, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.6117360591888428 + }, + { + "auxiliary_loss_clip": 0.01156479, + "auxiliary_loss_mlp": 0.01133582, + "balance_loss_clip": 1.00221169, + "balance_loss_mlp": 1.00073552, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6377331496196055, + "language_loss": 0.72568226, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74858284, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 2.588247537612915 + }, + { + "auxiliary_loss_clip": 0.01141176, + "auxiliary_loss_mlp": 0.01133269, + "balance_loss_clip": 1.0022161, + "balance_loss_mlp": 1.0007081, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.5159924785641097, + "language_loss": 0.83380228, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.8565467, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.627568483352661 + }, + { + "auxiliary_loss_clip": 0.01140209, + "auxiliary_loss_mlp": 0.01133393, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00064182, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.7333330110771559, + "language_loss": 0.79290211, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81563818, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.6164021492004395 + }, + { + "auxiliary_loss_clip": 0.01139225, + "auxiliary_loss_mlp": 0.01133377, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00053072, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.7196335478721585, + "language_loss": 0.74728984, + "learning_rate": 3.134526351787587e-06, + "loss": 0.77001584, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.6313445568084717 + }, + { + "auxiliary_loss_clip": 0.01142895, + "auxiliary_loss_mlp": 0.01134534, + "balance_loss_clip": 1.00226164, + "balance_loss_mlp": 1.00073349, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.976008754499609, + "language_loss": 0.79017502, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8129493, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.560453176498413 + }, + { + "auxiliary_loss_clip": 0.0112981, + "auxiliary_loss_mlp": 0.01133323, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.00066733, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.6531044166783417, + "language_loss": 0.81796306, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84059441, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 2.602952718734741 + }, + { + "auxiliary_loss_clip": 0.01172873, + "auxiliary_loss_mlp": 0.01133539, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.00078714, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.8476225768273775, + "language_loss": 0.67787796, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.70094204, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.730666160583496 + }, + { + "auxiliary_loss_clip": 0.01172961, + "auxiliary_loss_mlp": 0.01134732, + "balance_loss_clip": 1.00223732, + "balance_loss_mlp": 1.00074124, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.8160128221527299, + "language_loss": 0.65133667, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67441362, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.5873541831970215 + }, + { + "auxiliary_loss_clip": 0.01156317, + "auxiliary_loss_mlp": 0.01134878, + "balance_loss_clip": 1.00222564, + "balance_loss_mlp": 1.00088632, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6113604828336228, + "language_loss": 0.88290226, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90581417, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.5372493267059326 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01134184, + "balance_loss_clip": 1.00205839, + "balance_loss_mlp": 1.00066972, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 3.8511002843560065, + "language_loss": 0.78021336, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80280465, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.6310441493988037 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.01120071, + "balance_loss_clip": 1.0029335, + "balance_loss_mlp": 1.0002898, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8011162221739029, + "language_loss": 0.60152096, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62410331, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.138202667236328 + }, + { + "auxiliary_loss_clip": 0.01125859, + "auxiliary_loss_mlp": 0.01135226, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.00094891, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.4158457718320525, + "language_loss": 0.77011192, + "learning_rate": 3.131959088630455e-06, + "loss": 0.79272282, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.662903070449829 + }, + { + "auxiliary_loss_clip": 0.01125312, + "auxiliary_loss_mlp": 0.0113483, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.00093436, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 2.831595170433737, + "language_loss": 0.74486828, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76746976, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.621657371520996 + }, + { + "auxiliary_loss_clip": 0.01172854, + "auxiliary_loss_mlp": 0.01133576, + "balance_loss_clip": 1.00223958, + "balance_loss_mlp": 1.00082493, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 2.329857763604358, + "language_loss": 0.75745118, + "learning_rate": 3.131316843357713e-06, + "loss": 0.78051555, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.5091779232025146 + }, + { + "auxiliary_loss_clip": 0.01157704, + "auxiliary_loss_mlp": 0.01133739, + "balance_loss_clip": 1.00231886, + "balance_loss_mlp": 1.00070095, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.7064552396983959, + "language_loss": 0.8053565, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82827091, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 2.5580594539642334 + }, + { + "auxiliary_loss_clip": 0.01140363, + "auxiliary_loss_mlp": 0.01119878, + "balance_loss_clip": 1.00339603, + "balance_loss_mlp": 1.0000968, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7483716375481364, + "language_loss": 0.56506205, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58766448, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.2469348907470703 + }, + { + "auxiliary_loss_clip": 0.01157683, + "auxiliary_loss_mlp": 0.00748081, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.00098133, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 2.334736571040692, + "language_loss": 0.77228498, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79134262, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.5791983604431152 + }, + { + "auxiliary_loss_clip": 0.01139445, + "auxiliary_loss_mlp": 0.01133456, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.00080037, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.5590791851964745, + "language_loss": 0.78519833, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80792737, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.6221330165863037 + }, + { + "auxiliary_loss_clip": 0.01157369, + "auxiliary_loss_mlp": 0.01134499, + "balance_loss_clip": 1.00219607, + "balance_loss_mlp": 1.00079429, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 2.7672084840650513, + "language_loss": 0.74175167, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76467031, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.536780834197998 + }, + { + "auxiliary_loss_clip": 0.01157541, + "auxiliary_loss_mlp": 0.01133916, + "balance_loss_clip": 1.00229156, + "balance_loss_mlp": 1.00078297, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 2.2198963523853514, + "language_loss": 0.75582039, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77873492, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 4.02324914932251 + }, + { + "auxiliary_loss_clip": 0.01173001, + "auxiliary_loss_mlp": 0.01133751, + "balance_loss_clip": 1.00231123, + "balance_loss_mlp": 1.00080907, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 1.684055398103515, + "language_loss": 0.71678138, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73984897, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.483391523361206 + }, + { + "auxiliary_loss_clip": 0.01090658, + "auxiliary_loss_mlp": 0.01133833, + "balance_loss_clip": 1.00195563, + "balance_loss_mlp": 1.00079525, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.6912674809344979, + "language_loss": 0.80213022, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82437509, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.7864785194396973 + }, + { + "auxiliary_loss_clip": 0.01140722, + "auxiliary_loss_mlp": 0.01133992, + "balance_loss_clip": 1.00204957, + "balance_loss_mlp": 1.00076413, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 1.8726858389645449, + "language_loss": 0.8433491, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.8660962, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.6185381412506104 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01134372, + "balance_loss_clip": 1.00203097, + "balance_loss_mlp": 1.00076246, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 1.9714347964127108, + "language_loss": 0.74053013, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76297355, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.643472671508789 + }, + { + "auxiliary_loss_clip": 0.01172936, + "auxiliary_loss_mlp": 0.01134568, + "balance_loss_clip": 1.00227988, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.5429668635021248, + "language_loss": 0.72493434, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74800938, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.497297525405884 + }, + { + "auxiliary_loss_clip": 0.01172866, + "auxiliary_loss_mlp": 0.01133666, + "balance_loss_clip": 1.00220203, + "balance_loss_mlp": 1.00062895, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.9719276915682693, + "language_loss": 0.88894087, + "learning_rate": 3.127459771562238e-06, + "loss": 0.91200614, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 3.9124770164489746 + }, + { + "auxiliary_loss_clip": 0.01156885, + "auxiliary_loss_mlp": 0.01132834, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00065482, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.9006891396032282, + "language_loss": 0.83474123, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85763836, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.5159621238708496 + }, + { + "auxiliary_loss_clip": 0.01139813, + "auxiliary_loss_mlp": 0.01133399, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.00083852, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.082732855739521, + "language_loss": 0.77312064, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79585278, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.6167452335357666 + }, + { + "auxiliary_loss_clip": 0.01173027, + "auxiliary_loss_mlp": 0.01134642, + "balance_loss_clip": 1.00226998, + "balance_loss_mlp": 1.00093675, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 1.8851097300730844, + "language_loss": 0.74548775, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76856446, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.4959523677825928 + }, + { + "auxiliary_loss_clip": 0.01108058, + "auxiliary_loss_mlp": 0.01118992, + "balance_loss_clip": 1.00282216, + "balance_loss_mlp": 0.99997348, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7825677416598911, + "language_loss": 0.53936732, + "learning_rate": 3.12617271181492e-06, + "loss": 0.56163788, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.2661824226379395 + }, + { + "auxiliary_loss_clip": 0.01142327, + "auxiliary_loss_mlp": 0.01134607, + "balance_loss_clip": 1.00213873, + "balance_loss_mlp": 1.0009017, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.8051707210329, + "language_loss": 0.87038577, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.8931551, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 5.5231733322143555 + }, + { + "auxiliary_loss_clip": 0.01123314, + "auxiliary_loss_mlp": 0.01134368, + "balance_loss_clip": 1.0017308, + "balance_loss_mlp": 1.00085378, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 1.9980965374572828, + "language_loss": 0.73464996, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75722682, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.7012078762054443 + }, + { + "auxiliary_loss_clip": 0.01140971, + "auxiliary_loss_mlp": 0.01133808, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00067472, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 1.9558175908126103, + "language_loss": 0.72091466, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74366248, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 2.5878474712371826 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01134011, + "balance_loss_clip": 1.002092, + "balance_loss_mlp": 1.00078237, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 1.835847945814309, + "language_loss": 0.80129915, + "learning_rate": 3.124884968794321e-06, + "loss": 0.8240481, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.663179636001587 + }, + { + "auxiliary_loss_clip": 0.01157589, + "auxiliary_loss_mlp": 0.01134164, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00064957, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1338914464432843, + "language_loss": 0.75967014, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78258765, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.5326459407806396 + }, + { + "auxiliary_loss_clip": 0.01140487, + "auxiliary_loss_mlp": 0.01133545, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00069809, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.4692763515916387, + "language_loss": 0.79017258, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81291288, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.6318438053131104 + }, + { + "auxiliary_loss_clip": 0.01155725, + "auxiliary_loss_mlp": 0.01134313, + "balance_loss_clip": 1.00209534, + "balance_loss_mlp": 1.00060761, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.001622895566306, + "language_loss": 0.66377419, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68667459, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.6493399143218994 + }, + { + "auxiliary_loss_clip": 0.01156218, + "auxiliary_loss_mlp": 0.01134361, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.00065589, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.088683480103592, + "language_loss": 0.7741524, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79705822, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.508815288543701 + }, + { + "auxiliary_loss_clip": 0.01139916, + "auxiliary_loss_mlp": 0.01135206, + "balance_loss_clip": 1.00231647, + "balance_loss_mlp": 1.0006423, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.5281387910484936, + "language_loss": 0.72302842, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74577951, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.6154820919036865 + }, + { + "auxiliary_loss_clip": 0.01140674, + "auxiliary_loss_mlp": 0.0113384, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00070667, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.4921545571659762, + "language_loss": 0.75298184, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77572697, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.6556482315063477 + }, + { + "auxiliary_loss_clip": 0.01145336, + "auxiliary_loss_mlp": 0.01133653, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00071061, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.8130360021870127, + "language_loss": 0.70080966, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72359955, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.647955894470215 + }, + { + "auxiliary_loss_clip": 0.01157695, + "auxiliary_loss_mlp": 0.01134055, + "balance_loss_clip": 1.00233936, + "balance_loss_mlp": 1.00101781, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.6318360639698848, + "language_loss": 0.82235754, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84527504, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.60734486579895 + }, + { + "auxiliary_loss_clip": 0.01157949, + "auxiliary_loss_mlp": 0.01134458, + "balance_loss_clip": 1.00232589, + "balance_loss_mlp": 1.00075293, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.911430741013394, + "language_loss": 0.79245937, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81538343, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.585545301437378 + }, + { + "auxiliary_loss_clip": 0.01139339, + "auxiliary_loss_mlp": 0.0113369, + "balance_loss_clip": 1.00198662, + "balance_loss_mlp": 1.0010345, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.6059663779565538, + "language_loss": 0.71622264, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73895299, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.6227805614471436 + }, + { + "auxiliary_loss_clip": 0.01141162, + "auxiliary_loss_mlp": 0.01133596, + "balance_loss_clip": 1.00230515, + "balance_loss_mlp": 1.00084496, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 1.8457715815703877, + "language_loss": 0.71631134, + "learning_rate": 3.12134015873989e-06, + "loss": 0.73905891, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 2.635706901550293 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01135209, + "balance_loss_clip": 1.00242257, + "balance_loss_mlp": 1.00074089, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.5986768737663812, + "language_loss": 0.73410314, + "learning_rate": 3.121017647907921e-06, + "loss": 0.7570219, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.61432147026062 + }, + { + "auxiliary_loss_clip": 0.01131838, + "auxiliary_loss_mlp": 0.01133584, + "balance_loss_clip": 1.00215697, + "balance_loss_mlp": 1.00102305, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.3041734256534725, + "language_loss": 0.88274324, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90539742, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.580443859100342 + }, + { + "auxiliary_loss_clip": 0.01108455, + "auxiliary_loss_mlp": 0.01132805, + "balance_loss_clip": 1.00189912, + "balance_loss_mlp": 1.00072157, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.8353966661882712, + "language_loss": 0.73412299, + "learning_rate": 3.12037249872891e-06, + "loss": 0.75653553, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.656193971633911 + }, + { + "auxiliary_loss_clip": 0.01124347, + "auxiliary_loss_mlp": 0.01133275, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00080943, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.6735540359516357, + "language_loss": 0.72200102, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74457729, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.7407002449035645 + }, + { + "auxiliary_loss_clip": 0.01123651, + "auxiliary_loss_mlp": 0.01133515, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00066793, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.848159871309519, + "language_loss": 0.68847233, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.71104401, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.5989246368408203 + }, + { + "auxiliary_loss_clip": 0.0114091, + "auxiliary_loss_mlp": 0.01133983, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.00085008, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.046252595128116, + "language_loss": 0.66193199, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68468094, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.6050827503204346 + }, + { + "auxiliary_loss_clip": 0.01156142, + "auxiliary_loss_mlp": 0.01133888, + "balance_loss_clip": 1.00218928, + "balance_loss_mlp": 1.00085044, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.7027939637295155, + "language_loss": 0.6912514, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71415174, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.6012208461761475 + }, + { + "auxiliary_loss_clip": 0.01156196, + "auxiliary_loss_mlp": 0.01134345, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00092554, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.1384909526942035, + "language_loss": 0.80633044, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82923591, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.57196307182312 + }, + { + "auxiliary_loss_clip": 0.01157281, + "auxiliary_loss_mlp": 0.01133587, + "balance_loss_clip": 1.00217962, + "balance_loss_mlp": 1.00083601, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.70158142894943, + "language_loss": 0.7467891, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76969779, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 2.547309160232544 + }, + { + "auxiliary_loss_clip": 0.01140607, + "auxiliary_loss_mlp": 0.01119307, + "balance_loss_clip": 1.00364876, + "balance_loss_mlp": 1.00028825, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6181079287866922, + "language_loss": 0.54326153, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56586069, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.2682647705078125 + }, + { + "auxiliary_loss_clip": 0.01156314, + "auxiliary_loss_mlp": 0.01133782, + "balance_loss_clip": 1.0021975, + "balance_loss_mlp": 1.00093532, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 2.172957369265051, + "language_loss": 0.78253835, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80543935, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.5541892051696777 + }, + { + "auxiliary_loss_clip": 0.01140707, + "auxiliary_loss_mlp": 0.01132921, + "balance_loss_clip": 1.00227034, + "balance_loss_mlp": 1.00074196, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 1.7311970368472125, + "language_loss": 0.76662767, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78936398, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 2.676687717437744 + }, + { + "auxiliary_loss_clip": 0.01156096, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00099897, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 2.2547505649671153, + "language_loss": 0.70133972, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72423625, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 4.030909538269043 + }, + { + "auxiliary_loss_clip": 0.01140555, + "auxiliary_loss_mlp": 0.01132908, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00082397, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.9290612425120541, + "language_loss": 0.73841888, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76115352, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 2.611283302307129 + }, + { + "auxiliary_loss_clip": 0.01140635, + "auxiliary_loss_mlp": 0.01133136, + "balance_loss_clip": 1.00203192, + "balance_loss_mlp": 1.0007658, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.62401190738778, + "language_loss": 0.81975484, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84249252, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.597231149673462 + }, + { + "auxiliary_loss_clip": 0.01122243, + "auxiliary_loss_mlp": 0.00747995, + "balance_loss_clip": 1.00166321, + "balance_loss_mlp": 1.00093758, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.9781985952957015, + "language_loss": 0.83069074, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84939313, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.614194393157959 + }, + { + "auxiliary_loss_clip": 0.01172151, + "auxiliary_loss_mlp": 0.01119287, + "balance_loss_clip": 1.0038172, + "balance_loss_mlp": 1.00026822, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7790338647428396, + "language_loss": 0.5257529, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54866719, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.0840415954589844 + }, + { + "auxiliary_loss_clip": 0.01126045, + "auxiliary_loss_mlp": 0.00748129, + "balance_loss_clip": 1.00222278, + "balance_loss_mlp": 1.00104308, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.0941935206613485, + "language_loss": 0.77600181, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79474354, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 3.9838409423828125 + }, + { + "auxiliary_loss_clip": 0.01108013, + "auxiliary_loss_mlp": 0.01132503, + "balance_loss_clip": 1.00186932, + "balance_loss_mlp": 1.0008961, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.9491793969098767, + "language_loss": 0.72146368, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74386883, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.666686773300171 + }, + { + "auxiliary_loss_clip": 0.01140346, + "auxiliary_loss_mlp": 0.01132416, + "balance_loss_clip": 1.00204957, + "balance_loss_mlp": 1.00061822, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 2.0043654826126316, + "language_loss": 0.82993871, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.85266638, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.5621254444122314 + }, + { + "auxiliary_loss_clip": 0.01140838, + "auxiliary_loss_mlp": 0.00747949, + "balance_loss_clip": 1.00207531, + "balance_loss_mlp": 1.00080216, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.8967711116053845, + "language_loss": 0.70143771, + "learning_rate": 3.114558520634423e-06, + "loss": 0.72032559, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.601172924041748 + }, + { + "auxiliary_loss_clip": 0.01157771, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.2120039439378916, + "language_loss": 0.76177371, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78468335, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.553117513656616 + }, + { + "auxiliary_loss_clip": 0.01139329, + "auxiliary_loss_mlp": 0.01134042, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00081396, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.9333339803179814, + "language_loss": 0.73392034, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75665402, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 2.653806686401367 + }, + { + "auxiliary_loss_clip": 0.01138698, + "auxiliary_loss_mlp": 0.01132857, + "balance_loss_clip": 1.00215483, + "balance_loss_mlp": 1.00077343, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 1.8126906912102891, + "language_loss": 0.66227674, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.68499231, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 5.527487516403198 + }, + { + "auxiliary_loss_clip": 0.01091671, + "auxiliary_loss_mlp": 0.01132463, + "balance_loss_clip": 1.00188887, + "balance_loss_mlp": 1.00076079, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.6298036709958303, + "language_loss": 0.71660805, + "learning_rate": 3.113264663362451e-06, + "loss": 0.7388494, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.7230045795440674 + }, + { + "auxiliary_loss_clip": 0.0113088, + "auxiliary_loss_mlp": 0.01134076, + "balance_loss_clip": 1.00263715, + "balance_loss_mlp": 1.00113416, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.4744654594530806, + "language_loss": 0.67372, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69636953, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.6666202545166016 + }, + { + "auxiliary_loss_clip": 0.01157556, + "auxiliary_loss_mlp": 0.00747985, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.00092995, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 1.9516640463429729, + "language_loss": 0.73048437, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74953985, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 2.5912363529205322 + }, + { + "auxiliary_loss_clip": 0.01155958, + "auxiliary_loss_mlp": 0.011324, + "balance_loss_clip": 1.00219882, + "balance_loss_mlp": 1.00088847, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.646855250796994, + "language_loss": 0.81820446, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84108806, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.579765796661377 + }, + { + "auxiliary_loss_clip": 0.01156019, + "auxiliary_loss_mlp": 0.01133764, + "balance_loss_clip": 1.00217354, + "balance_loss_mlp": 1.0007267, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.6803924821891127, + "language_loss": 0.71885037, + "learning_rate": 3.111970130648789e-06, + "loss": 0.74174827, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.6386373043060303 + }, + { + "auxiliary_loss_clip": 0.01155874, + "auxiliary_loss_mlp": 0.01132379, + "balance_loss_clip": 1.00216067, + "balance_loss_mlp": 1.00077224, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 1.7889155048997234, + "language_loss": 0.74362803, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76651061, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.5606110095977783 + }, + { + "auxiliary_loss_clip": 0.01172785, + "auxiliary_loss_mlp": 0.01133352, + "balance_loss_clip": 1.00222647, + "balance_loss_mlp": 1.00069559, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 3.0303283735397653, + "language_loss": 0.71440727, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73746866, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.4752378463745117 + }, + { + "auxiliary_loss_clip": 0.01155859, + "auxiliary_loss_mlp": 0.01132986, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00061584, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.4554727306051234, + "language_loss": 0.60869968, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.6315881, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.695603609085083 + }, + { + "auxiliary_loss_clip": 0.01140024, + "auxiliary_loss_mlp": 0.01133028, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.0007534, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.7381668761597728, + "language_loss": 0.68933105, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.71206158, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.5921096801757812 + }, + { + "auxiliary_loss_clip": 0.01157125, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_clip": 1.0021888, + "balance_loss_mlp": 1.00074625, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.5545354267165319, + "language_loss": 0.75196886, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77487129, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.523700714111328 + }, + { + "auxiliary_loss_clip": 0.01075485, + "auxiliary_loss_mlp": 0.01134122, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00098908, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.9241771902372917, + "language_loss": 0.75075495, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77285099, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.77697491645813 + }, + { + "auxiliary_loss_clip": 0.011727, + "auxiliary_loss_mlp": 0.0113319, + "balance_loss_clip": 1.0021944, + "balance_loss_mlp": 1.00062907, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.64639296481475, + "language_loss": 0.70792305, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73098195, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.542377233505249 + }, + { + "auxiliary_loss_clip": 0.01128882, + "auxiliary_loss_mlp": 0.0113315, + "balance_loss_clip": 1.00233459, + "balance_loss_mlp": 1.00077999, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.601866109081005, + "language_loss": 0.6920349, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71465522, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.5939300060272217 + }, + { + "auxiliary_loss_clip": 0.01123905, + "auxiliary_loss_mlp": 0.01133783, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.0008409, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 2.1571992608810504, + "language_loss": 0.6488139, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.67139077, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.6897644996643066 + }, + { + "auxiliary_loss_clip": 0.0114187, + "auxiliary_loss_mlp": 0.01132641, + "balance_loss_clip": 1.00232434, + "balance_loss_mlp": 1.00065231, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.8439902537676236, + "language_loss": 0.85436338, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87710845, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.5825345516204834 + }, + { + "auxiliary_loss_clip": 0.01155429, + "auxiliary_loss_mlp": 0.01133204, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00073814, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.929737317912296, + "language_loss": 0.74641198, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76929832, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 2.7325680255889893 + }, + { + "auxiliary_loss_clip": 0.01157549, + "auxiliary_loss_mlp": 0.01132793, + "balance_loss_clip": 1.00226605, + "balance_loss_mlp": 1.00070858, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.7121351414526946, + "language_loss": 0.68802571, + "learning_rate": 3.108082487713921e-06, + "loss": 0.7109291, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.768372058868408 + }, + { + "auxiliary_loss_clip": 0.01125799, + "auxiliary_loss_mlp": 0.01133845, + "balance_loss_clip": 1.00225997, + "balance_loss_mlp": 1.00080729, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.9482151406340358, + "language_loss": 0.60523176, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62782818, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.6458797454833984 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01133936, + "balance_loss_clip": 1.00221336, + "balance_loss_mlp": 1.00080299, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6091000291804525, + "language_loss": 0.70360714, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72619176, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.6048429012298584 + }, + { + "auxiliary_loss_clip": 0.01125575, + "auxiliary_loss_mlp": 0.01132961, + "balance_loss_clip": 1.00208962, + "balance_loss_mlp": 1.00068665, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 1.9393453330686887, + "language_loss": 0.82973588, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85232121, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.604595899581909 + }, + { + "auxiliary_loss_clip": 0.0113888, + "auxiliary_loss_mlp": 0.00748068, + "balance_loss_clip": 1.00205791, + "balance_loss_mlp": 1.00114465, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.2643074833581087, + "language_loss": 0.80754936, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82641888, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.591102361679077 + }, + { + "auxiliary_loss_clip": 0.01156072, + "auxiliary_loss_mlp": 0.01133192, + "balance_loss_clip": 1.00222754, + "balance_loss_mlp": 1.00082195, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4437416808952552, + "language_loss": 0.81500238, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83789504, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.601130247116089 + }, + { + "auxiliary_loss_clip": 0.01157514, + "auxiliary_loss_mlp": 0.01132533, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.00063944, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.8687814596014307, + "language_loss": 0.74605352, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76895392, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.6208643913269043 + }, + { + "auxiliary_loss_clip": 0.01156114, + "auxiliary_loss_mlp": 0.01132409, + "balance_loss_clip": 1.00226068, + "balance_loss_mlp": 1.00080204, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.5790989739526946, + "language_loss": 0.82504821, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84793347, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.5724501609802246 + }, + { + "auxiliary_loss_clip": 0.01139373, + "auxiliary_loss_mlp": 0.01133126, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.00066066, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.4345325940518352, + "language_loss": 0.80240154, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82512653, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 2.6715753078460693 + }, + { + "auxiliary_loss_clip": 0.01140972, + "auxiliary_loss_mlp": 0.01132409, + "balance_loss_clip": 1.00220263, + "balance_loss_mlp": 1.00061083, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.9483001417614239, + "language_loss": 0.82202482, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84475869, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 4.073791027069092 + }, + { + "auxiliary_loss_clip": 0.01123828, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_clip": 1.00214744, + "balance_loss_mlp": 1.00077391, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 1.9955555687621371, + "language_loss": 0.71686375, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.73943055, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0114056, + "auxiliary_loss_mlp": 0.01133997, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00105464, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.47088128054429, + "language_loss": 0.75120789, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77395344, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.6830968856811523 + }, + { + "auxiliary_loss_clip": 0.01140835, + "auxiliary_loss_mlp": 0.01133369, + "balance_loss_clip": 1.00225544, + "balance_loss_mlp": 1.00080776, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 1.6740277399486807, + "language_loss": 0.69540375, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71814585, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.5813186168670654 + }, + { + "auxiliary_loss_clip": 0.01155875, + "auxiliary_loss_mlp": 0.01132351, + "balance_loss_clip": 1.00219667, + "balance_loss_mlp": 1.00083935, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.5227366573122474, + "language_loss": 0.65081918, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67370141, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.6079440116882324 + }, + { + "auxiliary_loss_clip": 0.01090545, + "auxiliary_loss_mlp": 0.01133212, + "balance_loss_clip": 1.00173378, + "balance_loss_mlp": 1.00084162, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 1.4039031432609066, + "language_loss": 0.74283946, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76507699, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 2.9835829734802246 + }, + { + "auxiliary_loss_clip": 0.01125078, + "auxiliary_loss_mlp": 0.01119265, + "balance_loss_clip": 1.00383878, + "balance_loss_mlp": 1.00024641, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7784083416136574, + "language_loss": 0.55485916, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57730258, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 4.535800933837891 + }, + { + "auxiliary_loss_clip": 0.01172632, + "auxiliary_loss_mlp": 0.01132727, + "balance_loss_clip": 1.00231838, + "balance_loss_mlp": 1.00073814, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7890875438592317, + "language_loss": 0.65027153, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67332512, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.644228935241699 + }, + { + "auxiliary_loss_clip": 0.01140665, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_clip": 1.00218773, + "balance_loss_mlp": 1.00060058, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.7942822266566472, + "language_loss": 0.77549469, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79823005, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.565577268600464 + }, + { + "auxiliary_loss_clip": 0.01139011, + "auxiliary_loss_mlp": 0.01133566, + "balance_loss_clip": 1.00222731, + "balance_loss_mlp": 1.0006243, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.6909351978638816, + "language_loss": 0.76123965, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78396541, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.5542092323303223 + }, + { + "auxiliary_loss_clip": 0.01122854, + "auxiliary_loss_mlp": 0.01133905, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.0010581, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 1.9370365016029556, + "language_loss": 0.70893896, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73150653, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 2.6202428340911865 + }, + { + "auxiliary_loss_clip": 0.0112538, + "auxiliary_loss_mlp": 0.0113469, + "balance_loss_clip": 1.0020895, + "balance_loss_mlp": 1.0007937, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.978361065480462, + "language_loss": 0.89982092, + "learning_rate": 3.10158964737502e-06, + "loss": 0.92242157, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.629817485809326 + }, + { + "auxiliary_loss_clip": 0.01123856, + "auxiliary_loss_mlp": 0.01133133, + "balance_loss_clip": 1.00222957, + "balance_loss_mlp": 1.00066781, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.4622430486999625, + "language_loss": 0.80200839, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82457829, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 5.545819282531738 + }, + { + "auxiliary_loss_clip": 0.01172235, + "auxiliary_loss_mlp": 0.00746913, + "balance_loss_clip": 1.00404954, + "balance_loss_mlp": 1.00040662, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8636140568903863, + "language_loss": 0.55895436, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5781458, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.0549473762512207 + }, + { + "auxiliary_loss_clip": 0.01172792, + "auxiliary_loss_mlp": 0.01132767, + "balance_loss_clip": 1.00244832, + "balance_loss_mlp": 1.0008738, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 1.926027222989764, + "language_loss": 0.77992469, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.8029803, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.583012580871582 + }, + { + "auxiliary_loss_clip": 0.01122661, + "auxiliary_loss_mlp": 0.01132669, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00096703, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.272450965848676, + "language_loss": 0.72808862, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.75064194, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.725940227508545 + }, + { + "auxiliary_loss_clip": 0.0115581, + "auxiliary_loss_mlp": 0.01132464, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.00076175, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.6968229727100883, + "language_loss": 0.87941241, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90229511, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.628418207168579 + }, + { + "auxiliary_loss_clip": 0.01139422, + "auxiliary_loss_mlp": 0.01134374, + "balance_loss_clip": 1.0021596, + "balance_loss_mlp": 1.00085998, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.3070415175277543, + "language_loss": 0.82327461, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84601259, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.580156087875366 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_clip": 1.00228405, + "balance_loss_mlp": 1.00072336, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.3878740597509576, + "language_loss": 0.72892439, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75181389, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.6269614696502686 + }, + { + "auxiliary_loss_clip": 0.01129526, + "auxiliary_loss_mlp": 0.01134016, + "balance_loss_clip": 1.00262749, + "balance_loss_mlp": 1.00097847, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.6790681119327264, + "language_loss": 0.8161754, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8388108, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.7090845108032227 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.00748098, + "balance_loss_clip": 1.00215316, + "balance_loss_mlp": 1.00104356, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 2.9463545900671866, + "language_loss": 0.71541411, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73386371, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.689891815185547 + }, + { + "auxiliary_loss_clip": 0.01107074, + "auxiliary_loss_mlp": 0.01133599, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00103831, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 1.898470383347692, + "language_loss": 0.81172752, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83413422, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.6702702045440674 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.01133598, + "balance_loss_clip": 1.00261593, + "balance_loss_mlp": 1.00075161, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.7558708172161743, + "language_loss": 0.78216624, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80495858, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.614032745361328 + }, + { + "auxiliary_loss_clip": 0.01124, + "auxiliary_loss_mlp": 0.01134003, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00077426, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 2.2221040391618168, + "language_loss": 0.74815708, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.77073717, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.59757399559021 + }, + { + "auxiliary_loss_clip": 0.01140907, + "auxiliary_loss_mlp": 0.01133823, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00097656, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.528120735903899, + "language_loss": 0.82076359, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84351093, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 2.5805060863494873 + }, + { + "auxiliary_loss_clip": 0.01139356, + "auxiliary_loss_mlp": 0.01133325, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.00105047, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9587545566996882, + "language_loss": 0.77308011, + "learning_rate": 3.097034711451581e-06, + "loss": 0.795807, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.698122024536133 + }, + { + "auxiliary_loss_clip": 0.01139499, + "auxiliary_loss_mlp": 0.011334, + "balance_loss_clip": 1.00221133, + "balance_loss_mlp": 1.00074363, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.638182262360704, + "language_loss": 0.76223373, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78496277, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.5966451168060303 + }, + { + "auxiliary_loss_clip": 0.01157056, + "auxiliary_loss_mlp": 0.01133096, + "balance_loss_clip": 1.00225544, + "balance_loss_mlp": 1.00082135, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.5014905084675605, + "language_loss": 0.77592158, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79882306, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.593228578567505 + }, + { + "auxiliary_loss_clip": 0.01123744, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_clip": 1.00220633, + "balance_loss_mlp": 1.00084376, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 2.294617522992424, + "language_loss": 0.81132573, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83391058, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.6191418170928955 + }, + { + "auxiliary_loss_clip": 0.01172787, + "auxiliary_loss_mlp": 0.01133366, + "balance_loss_clip": 1.00245154, + "balance_loss_mlp": 1.00090027, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7854897334887987, + "language_loss": 0.67037189, + "learning_rate": 3.095731802118677e-06, + "loss": 0.6934334, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.4847588539123535 + }, + { + "auxiliary_loss_clip": 0.01142387, + "auxiliary_loss_mlp": 0.00748169, + "balance_loss_clip": 1.00236428, + "balance_loss_mlp": 1.0010314, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 3.096592294119392, + "language_loss": 0.70242625, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72133183, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 2.6596946716308594 + }, + { + "auxiliary_loss_clip": 0.01140998, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_clip": 1.00226665, + "balance_loss_mlp": 1.00084376, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 4.538911984610956, + "language_loss": 0.6723479, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.6950947, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.6399166584014893 + }, + { + "auxiliary_loss_clip": 0.01125011, + "auxiliary_loss_mlp": 0.0113324, + "balance_loss_clip": 1.0022403, + "balance_loss_mlp": 1.00087023, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.9356060544967197, + "language_loss": 0.73565638, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75823891, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.6410574913024902 + }, + { + "auxiliary_loss_clip": 0.01172879, + "auxiliary_loss_mlp": 0.01133195, + "balance_loss_clip": 1.00243914, + "balance_loss_mlp": 1.00092053, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 2.093464660836758, + "language_loss": 0.69916511, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72222579, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.5022194385528564 + }, + { + "auxiliary_loss_clip": 0.01138896, + "auxiliary_loss_mlp": 0.01133327, + "balance_loss_clip": 1.0021069, + "balance_loss_mlp": 1.00076628, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.003374465251256, + "language_loss": 0.76354444, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78626668, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.6089253425598145 + }, + { + "auxiliary_loss_clip": 0.0112496, + "auxiliary_loss_mlp": 0.00748247, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00098324, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.371314013094607, + "language_loss": 0.72425151, + "learning_rate": 3.093776191858731e-06, + "loss": 0.74298352, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.643258810043335 + }, + { + "auxiliary_loss_clip": 0.01108908, + "auxiliary_loss_mlp": 0.0074844, + "balance_loss_clip": 1.00205219, + "balance_loss_mlp": 1.00116634, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.5745252120722173, + "language_loss": 0.80001527, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81858873, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 2.6823980808258057 + }, + { + "auxiliary_loss_clip": 0.01139615, + "auxiliary_loss_mlp": 0.01133742, + "balance_loss_clip": 1.00230944, + "balance_loss_mlp": 1.00099099, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.6716510778709626, + "language_loss": 0.81588483, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.8386184, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.63415265083313 + }, + { + "auxiliary_loss_clip": 0.01139212, + "auxiliary_loss_mlp": 0.01133418, + "balance_loss_clip": 1.00214052, + "balance_loss_mlp": 1.00076222, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.4878001596872132, + "language_loss": 0.75574899, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.77847528, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 4.052539348602295 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01133515, + "balance_loss_clip": 1.00245726, + "balance_loss_mlp": 1.00066781, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.7181383303525704, + "language_loss": 0.78708482, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.80999273, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 2.5875625610351562 + }, + { + "auxiliary_loss_clip": 0.01172969, + "auxiliary_loss_mlp": 0.0113447, + "balance_loss_clip": 1.00238323, + "balance_loss_mlp": 1.00066948, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.584933225738235, + "language_loss": 0.64862114, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.67169559, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.722755193710327 + }, + { + "auxiliary_loss_clip": 0.01125794, + "auxiliary_loss_mlp": 0.01134367, + "balance_loss_clip": 1.00228167, + "balance_loss_mlp": 1.00085223, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 3.791576393518216, + "language_loss": 0.82215726, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84475887, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.5696780681610107 + }, + { + "auxiliary_loss_clip": 0.01157121, + "auxiliary_loss_mlp": 0.01134189, + "balance_loss_clip": 1.00234723, + "balance_loss_mlp": 1.00096083, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.0039882629035786, + "language_loss": 0.82815826, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.8510713, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.5398738384246826 + }, + { + "auxiliary_loss_clip": 0.01157039, + "auxiliary_loss_mlp": 0.01133128, + "balance_loss_clip": 1.00247693, + "balance_loss_mlp": 1.00085282, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.7934904253871924, + "language_loss": 0.82927948, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.8521812, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 2.515561819076538 + }, + { + "auxiliary_loss_clip": 0.01172995, + "auxiliary_loss_mlp": 0.01134223, + "balance_loss_clip": 1.00252676, + "balance_loss_mlp": 1.00118494, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.7125546565654914, + "language_loss": 0.6887297, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.71180189, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 3.888420343399048 + }, + { + "auxiliary_loss_clip": 0.01140733, + "auxiliary_loss_mlp": 0.0113509, + "balance_loss_clip": 1.0023762, + "balance_loss_mlp": 1.00119388, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.8901005456179987, + "language_loss": 0.83128834, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85404652, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.6347570419311523 + }, + { + "auxiliary_loss_clip": 0.01124477, + "auxiliary_loss_mlp": 0.01134096, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.0010587, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.4579941040771216, + "language_loss": 0.73800957, + "learning_rate": 3.090187030294409e-06, + "loss": 0.76059532, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 2.690826177597046 + }, + { + "auxiliary_loss_clip": 0.01140214, + "auxiliary_loss_mlp": 0.01134624, + "balance_loss_clip": 1.00221038, + "balance_loss_mlp": 1.00082338, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.6644278899578713, + "language_loss": 0.83699012, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85973853, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.5612881183624268 + }, + { + "auxiliary_loss_clip": 0.01140273, + "auxiliary_loss_mlp": 0.01134191, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.00096226, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.899747751597929, + "language_loss": 0.67857015, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70131475, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.6249825954437256 + }, + { + "auxiliary_loss_clip": 0.01157634, + "auxiliary_loss_mlp": 0.01134936, + "balance_loss_clip": 1.0023675, + "balance_loss_mlp": 1.00094485, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 1.7544736110282677, + "language_loss": 0.71053386, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73345953, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 5.6033806800842285 + }, + { + "auxiliary_loss_clip": 0.01076929, + "auxiliary_loss_mlp": 0.01134369, + "balance_loss_clip": 1.0021832, + "balance_loss_mlp": 1.00114059, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 2.0942134884766874, + "language_loss": 0.79007232, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81218529, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.727020025253296 + }, + { + "auxiliary_loss_clip": 0.01156366, + "auxiliary_loss_mlp": 0.01134751, + "balance_loss_clip": 1.00243556, + "balance_loss_mlp": 1.00114083, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.6428187328246298, + "language_loss": 0.82401752, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84692872, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.5731148719787598 + }, + { + "auxiliary_loss_clip": 0.01155997, + "auxiliary_loss_mlp": 0.01133689, + "balance_loss_clip": 1.00232327, + "balance_loss_mlp": 1.00084162, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 2.791464625078872, + "language_loss": 0.81974322, + "learning_rate": 3.088227196412879e-06, + "loss": 0.8426401, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.5313336849212646 + }, + { + "auxiliary_loss_clip": 0.01140551, + "auxiliary_loss_mlp": 0.01134197, + "balance_loss_clip": 1.00244808, + "balance_loss_mlp": 1.00087333, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.660629018162014, + "language_loss": 0.79588771, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81863523, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.6443233489990234 + }, + { + "auxiliary_loss_clip": 0.01090407, + "auxiliary_loss_mlp": 0.0113407, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00093746, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.989715189745876, + "language_loss": 0.70425069, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7264955, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 2.83361554145813 + }, + { + "auxiliary_loss_clip": 0.01140975, + "auxiliary_loss_mlp": 0.01134248, + "balance_loss_clip": 1.00231433, + "balance_loss_mlp": 1.00101972, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 2.634256482793357, + "language_loss": 0.79406399, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81681627, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.5912277698516846 + }, + { + "auxiliary_loss_clip": 0.01141318, + "auxiliary_loss_mlp": 0.01134346, + "balance_loss_clip": 1.00235128, + "balance_loss_mlp": 1.00092709, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.7404874956601266, + "language_loss": 0.91302049, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93577713, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.619539260864258 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01134082, + "balance_loss_clip": 1.00239205, + "balance_loss_mlp": 1.0011394, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.739444155367004, + "language_loss": 0.80971652, + "learning_rate": 3.086592866591809e-06, + "loss": 0.83262038, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.660895824432373 + }, + { + "auxiliary_loss_clip": 0.01157108, + "auxiliary_loss_mlp": 0.00748386, + "balance_loss_clip": 1.00236261, + "balance_loss_mlp": 1.00114036, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.1883579733978586, + "language_loss": 0.83911079, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.85816574, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 2.5449938774108887 + }, + { + "auxiliary_loss_clip": 0.0108293, + "auxiliary_loss_mlp": 0.01133574, + "balance_loss_clip": 1.0024848, + "balance_loss_mlp": 1.0010128, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.6811754059974513, + "language_loss": 0.80098069, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82314575, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.7414023876190186 + }, + { + "auxiliary_loss_clip": 0.01108833, + "auxiliary_loss_mlp": 0.01133837, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00089478, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9230576551049554, + "language_loss": 0.71130937, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73373604, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.716517925262451 + }, + { + "auxiliary_loss_clip": 0.01140679, + "auxiliary_loss_mlp": 0.01133645, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00108409, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 2.8396098274383132, + "language_loss": 0.70061535, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72335857, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.6048665046691895 + }, + { + "auxiliary_loss_clip": 0.01172964, + "auxiliary_loss_mlp": 0.01133978, + "balance_loss_clip": 1.0025115, + "balance_loss_mlp": 1.00103605, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.8807470216746407, + "language_loss": 0.67884874, + "learning_rate": 3.084957506678058e-06, + "loss": 0.70191824, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.55244779586792 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01133702, + "balance_loss_clip": 1.00222635, + "balance_loss_mlp": 1.00104582, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.6600297662430616, + "language_loss": 0.82977825, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85250825, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.633107900619507 + }, + { + "auxiliary_loss_clip": 0.01123105, + "auxiliary_loss_mlp": 0.01133538, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00069118, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4325544807726625, + "language_loss": 0.73388147, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75644797, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 2.65632963180542 + }, + { + "auxiliary_loss_clip": 0.01141865, + "auxiliary_loss_mlp": 0.01120223, + "balance_loss_clip": 1.00408006, + "balance_loss_mlp": 1.00044119, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7312185957026157, + "language_loss": 0.54845464, + "learning_rate": 3.083975796930215e-06, + "loss": 0.5710755, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.370133876800537 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.0113397, + "balance_loss_clip": 1.00201154, + "balance_loss_mlp": 1.00112283, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 2.1955567275530568, + "language_loss": 0.73491073, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75749195, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.6594817638397217 + }, + { + "auxiliary_loss_clip": 0.01156291, + "auxiliary_loss_mlp": 0.01133982, + "balance_loss_clip": 1.00227857, + "balance_loss_mlp": 1.00094461, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 3.0350417052180774, + "language_loss": 0.70660353, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72950631, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.577906608581543 + }, + { + "auxiliary_loss_clip": 0.01140808, + "auxiliary_loss_mlp": 0.01133227, + "balance_loss_clip": 1.00224614, + "balance_loss_mlp": 1.00076163, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.680850471987295, + "language_loss": 0.81214237, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83488268, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.6264915466308594 + }, + { + "auxiliary_loss_clip": 0.01156145, + "auxiliary_loss_mlp": 0.00748473, + "balance_loss_clip": 1.00221181, + "balance_loss_mlp": 1.00127625, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 1.7835895825702275, + "language_loss": 0.79943103, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.81847721, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.566098928451538 + }, + { + "auxiliary_loss_clip": 0.01107202, + "auxiliary_loss_mlp": 0.01134563, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00076246, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.931132402581593, + "language_loss": 0.7697556, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79217321, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.6789681911468506 + }, + { + "auxiliary_loss_clip": 0.01140795, + "auxiliary_loss_mlp": 0.01134375, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00086105, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.9725303805048948, + "language_loss": 0.847767, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87051868, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.5632457733154297 + }, + { + "auxiliary_loss_clip": 0.01107454, + "auxiliary_loss_mlp": 0.0113415, + "balance_loss_clip": 1.00210154, + "balance_loss_mlp": 1.0013032, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.0027175385534544, + "language_loss": 0.71331441, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73573047, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.6978912353515625 + }, + { + "auxiliary_loss_clip": 0.0113936, + "auxiliary_loss_mlp": 0.01120219, + "balance_loss_clip": 1.00407791, + "balance_loss_mlp": 1.00043774, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8428300291357563, + "language_loss": 0.56156564, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.5841614, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.2283244132995605 + }, + { + "auxiliary_loss_clip": 0.0115642, + "auxiliary_loss_mlp": 0.01133454, + "balance_loss_clip": 1.00210571, + "balance_loss_mlp": 1.00070262, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.925942051645976, + "language_loss": 0.8019188, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82481754, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.617825746536255 + }, + { + "auxiliary_loss_clip": 0.01125741, + "auxiliary_loss_mlp": 0.01134146, + "balance_loss_clip": 1.00216985, + "balance_loss_mlp": 1.00101304, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 3.1202717394420367, + "language_loss": 0.59706938, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61966825, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.662842035293579 + }, + { + "auxiliary_loss_clip": 0.01139654, + "auxiliary_loss_mlp": 0.01133767, + "balance_loss_clip": 1.00215232, + "balance_loss_mlp": 1.00082493, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.7916882771684497, + "language_loss": 0.92485255, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94758677, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 4.4922544956207275 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01133849, + "balance_loss_clip": 1.00227928, + "balance_loss_mlp": 1.00071657, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.9360841125250399, + "language_loss": 0.74852848, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77111423, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.6332309246063232 + }, + { + "auxiliary_loss_clip": 0.01157285, + "auxiliary_loss_mlp": 0.01133496, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.00093532, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.9200192431712158, + "language_loss": 0.83131045, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85421824, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 2.586568832397461 + }, + { + "auxiliary_loss_clip": 0.01107202, + "auxiliary_loss_mlp": 0.01134493, + "balance_loss_clip": 1.0021019, + "balance_loss_mlp": 1.00069273, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.65082857911356, + "language_loss": 0.69875574, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72117263, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 2.6644749641418457 + }, + { + "auxiliary_loss_clip": 0.01141164, + "auxiliary_loss_mlp": 0.01134127, + "balance_loss_clip": 1.00233078, + "balance_loss_mlp": 1.00118434, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.7164433350831625, + "language_loss": 0.81196231, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83471525, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.6323254108428955 + }, + { + "auxiliary_loss_clip": 0.01172732, + "auxiliary_loss_mlp": 0.01134093, + "balance_loss_clip": 1.00226653, + "balance_loss_mlp": 1.001055, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.3666061079373875, + "language_loss": 0.67710167, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70016992, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.4909162521362305 + }, + { + "auxiliary_loss_clip": 0.01140356, + "auxiliary_loss_mlp": 0.01134384, + "balance_loss_clip": 1.00228, + "balance_loss_mlp": 1.00077462, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.6428472195180737, + "language_loss": 0.69811845, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72086591, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 3.9535865783691406 + }, + { + "auxiliary_loss_clip": 0.01173014, + "auxiliary_loss_mlp": 0.01134697, + "balance_loss_clip": 1.0024457, + "balance_loss_mlp": 1.00099182, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 2.942579282585786, + "language_loss": 0.87262619, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89570332, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 2.544330596923828 + }, + { + "auxiliary_loss_clip": 0.01155876, + "auxiliary_loss_mlp": 0.01132566, + "balance_loss_clip": 1.00227833, + "balance_loss_mlp": 1.00057757, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 3.2375534473423118, + "language_loss": 0.83851457, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86139894, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.513746500015259 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01133693, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00103688, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 4.259382756109403, + "language_loss": 0.76731777, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79006052, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.580965995788574 + }, + { + "auxiliary_loss_clip": 0.01156245, + "auxiliary_loss_mlp": 0.01133561, + "balance_loss_clip": 1.00212228, + "balance_loss_mlp": 1.00100052, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 2.9000744868474384, + "language_loss": 0.62558138, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.64847946, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.5178685188293457 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01133266, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.000705, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 1.7967146265249545, + "language_loss": 0.76513124, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78802484, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 5.43395209312439 + }, + { + "auxiliary_loss_clip": 0.01156413, + "auxiliary_loss_mlp": 0.01134691, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00089097, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.0598841204832765, + "language_loss": 0.78798258, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81089365, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.5314292907714844 + }, + { + "auxiliary_loss_clip": 0.01139365, + "auxiliary_loss_mlp": 0.00748254, + "balance_loss_clip": 1.00235128, + "balance_loss_mlp": 1.00105047, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 2.6477949127668543, + "language_loss": 0.77762675, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79650295, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.620946168899536 + }, + { + "auxiliary_loss_clip": 0.01077721, + "auxiliary_loss_mlp": 0.01119553, + "balance_loss_clip": 1.00270462, + "balance_loss_mlp": 1.00053453, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7861957884336984, + "language_loss": 0.56330699, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5852797, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.3962743282318115 + }, + { + "auxiliary_loss_clip": 0.01140813, + "auxiliary_loss_mlp": 0.00748221, + "balance_loss_clip": 1.00225592, + "balance_loss_mlp": 1.00115132, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.6778693160607214, + "language_loss": 0.85902023, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87791055, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 2.8227596282958984 + }, + { + "auxiliary_loss_clip": 0.01157066, + "auxiliary_loss_mlp": 0.01133589, + "balance_loss_clip": 1.00222576, + "balance_loss_mlp": 1.00083733, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.6057475691035865, + "language_loss": 0.70956242, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73246896, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 2.674365997314453 + }, + { + "auxiliary_loss_clip": 0.01106948, + "auxiliary_loss_mlp": 0.01134463, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00094914, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 2.0390450428057916, + "language_loss": 0.81348193, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83589613, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.6507840156555176 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0113466, + "balance_loss_clip": 1.0024929, + "balance_loss_mlp": 1.0008595, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.66575804492944, + "language_loss": 0.77176678, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.7948432, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.548485517501831 + }, + { + "auxiliary_loss_clip": 0.01157345, + "auxiliary_loss_mlp": 0.01133582, + "balance_loss_clip": 1.0023154, + "balance_loss_mlp": 1.0007354, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.531763446678079, + "language_loss": 0.86069894, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.88360822, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.5061421394348145 + }, + { + "auxiliary_loss_clip": 0.01155977, + "auxiliary_loss_mlp": 0.01134429, + "balance_loss_clip": 1.00225699, + "balance_loss_mlp": 1.0008198, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 4.562890577384245, + "language_loss": 0.6540634, + "learning_rate": 3.073809861919351e-06, + "loss": 0.6769675, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.571437358856201 + }, + { + "auxiliary_loss_clip": 0.0115614, + "auxiliary_loss_mlp": 0.01133851, + "balance_loss_clip": 1.00242448, + "balance_loss_mlp": 1.00119495, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4508368519664887, + "language_loss": 0.76365507, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78655493, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.6088078022003174 + }, + { + "auxiliary_loss_clip": 0.01125607, + "auxiliary_loss_mlp": 0.01134443, + "balance_loss_clip": 1.00216854, + "balance_loss_mlp": 1.00083315, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.7221806736093956, + "language_loss": 0.83011574, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85271627, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 2.629556179046631 + }, + { + "auxiliary_loss_clip": 0.01139422, + "auxiliary_loss_mlp": 0.01133823, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.00088072, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.9381416441542125, + "language_loss": 0.8522824, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87501478, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.6186890602111816 + }, + { + "auxiliary_loss_clip": 0.01155164, + "auxiliary_loss_mlp": 0.01119026, + "balance_loss_clip": 1.00376058, + "balance_loss_mlp": 1.00000763, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.810365225804843, + "language_loss": 0.60044193, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62318379, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.1470787525177 + }, + { + "auxiliary_loss_clip": 0.01172727, + "auxiliary_loss_mlp": 0.0113318, + "balance_loss_clip": 1.00243318, + "balance_loss_mlp": 1.00090492, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.8988040656728118, + "language_loss": 0.67486209, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.69792116, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.540722370147705 + }, + { + "auxiliary_loss_clip": 0.01172852, + "auxiliary_loss_mlp": 0.0113429, + "balance_loss_clip": 1.00254071, + "balance_loss_mlp": 1.00106192, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.7885561010665303, + "language_loss": 0.674505, + "learning_rate": 3.071837730274918e-06, + "loss": 0.6975764, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 2.5676400661468506 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.00235879, + "balance_loss_mlp": 1.00092363, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.8038542719788493, + "language_loss": 0.78821933, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81094885, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.6305975914001465 + }, + { + "auxiliary_loss_clip": 0.0112407, + "auxiliary_loss_mlp": 0.01133994, + "balance_loss_clip": 1.00220716, + "balance_loss_mlp": 1.00095654, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.773073328330758, + "language_loss": 0.73358637, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75616705, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.6783628463745117 + }, + { + "auxiliary_loss_clip": 0.01124342, + "auxiliary_loss_mlp": 0.01132511, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00099885, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.8797938000070484, + "language_loss": 0.86447477, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88704336, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.5960774421691895 + }, + { + "auxiliary_loss_clip": 0.01172901, + "auxiliary_loss_mlp": 0.01133522, + "balance_loss_clip": 1.0024302, + "balance_loss_mlp": 1.00105703, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 2.441414430513279, + "language_loss": 0.69161963, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71468383, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.5114049911499023 + }, + { + "auxiliary_loss_clip": 0.01172803, + "auxiliary_loss_mlp": 0.01134004, + "balance_loss_clip": 1.00243068, + "balance_loss_mlp": 1.00087142, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.4465478100831546, + "language_loss": 0.73307639, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.7561444, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.4846465587615967 + }, + { + "auxiliary_loss_clip": 0.01157223, + "auxiliary_loss_mlp": 0.01134196, + "balance_loss_clip": 1.00229394, + "balance_loss_mlp": 1.00087237, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5621035594000003, + "language_loss": 0.7310673, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75398147, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.557577610015869 + }, + { + "auxiliary_loss_clip": 0.01154841, + "auxiliary_loss_mlp": 0.01119099, + "balance_loss_clip": 1.00362563, + "balance_loss_mlp": 1.00008035, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8250342414834384, + "language_loss": 0.63301337, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65575278, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.3031535148620605 + }, + { + "auxiliary_loss_clip": 0.01047743, + "auxiliary_loss_mlp": 0.01133399, + "balance_loss_clip": 1.00228667, + "balance_loss_mlp": 1.00112474, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 1.7705084498783574, + "language_loss": 0.7208553, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74266672, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.808285713195801 + }, + { + "auxiliary_loss_clip": 0.01122267, + "auxiliary_loss_mlp": 0.00748074, + "balance_loss_clip": 1.00211847, + "balance_loss_mlp": 1.00097215, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 2.0976611785739365, + "language_loss": 0.80553925, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82424259, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 2.612701654434204 + }, + { + "auxiliary_loss_clip": 0.01110321, + "auxiliary_loss_mlp": 0.01134524, + "balance_loss_clip": 1.00232697, + "balance_loss_mlp": 1.00091445, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.9997422680723125, + "language_loss": 0.77224863, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79469705, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.6898281574249268 + }, + { + "auxiliary_loss_clip": 0.01172818, + "auxiliary_loss_mlp": 0.00748138, + "balance_loss_clip": 1.00243974, + "balance_loss_mlp": 1.00099206, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.8678395723796963, + "language_loss": 0.73896384, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.75817341, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 3.9917333126068115 + }, + { + "auxiliary_loss_clip": 0.0115737, + "auxiliary_loss_mlp": 0.0113342, + "balance_loss_clip": 1.00230122, + "balance_loss_mlp": 1.00085902, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.7763918644562275, + "language_loss": 0.73658651, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75949442, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.5374855995178223 + }, + { + "auxiliary_loss_clip": 0.01157379, + "auxiliary_loss_mlp": 0.01132939, + "balance_loss_clip": 1.00227463, + "balance_loss_mlp": 1.00085473, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.770598764339022, + "language_loss": 0.80103302, + "learning_rate": 3.067559762415682e-06, + "loss": 0.82393622, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.871770143508911 + }, + { + "auxiliary_loss_clip": 0.01171702, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_clip": 1.00368881, + "balance_loss_mlp": 1.00003397, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.8741609338603652, + "language_loss": 0.56064224, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58354217, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.369708776473999 + }, + { + "auxiliary_loss_clip": 0.01140675, + "auxiliary_loss_mlp": 0.00747945, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00082612, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6132217575263625, + "language_loss": 0.79325235, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81213856, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.5888140201568604 + }, + { + "auxiliary_loss_clip": 0.01157403, + "auxiliary_loss_mlp": 0.01133344, + "balance_loss_clip": 1.00219643, + "balance_loss_mlp": 1.00068784, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.720368099906418, + "language_loss": 0.85578239, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87868989, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.549726963043213 + }, + { + "auxiliary_loss_clip": 0.01139206, + "auxiliary_loss_mlp": 0.01133518, + "balance_loss_clip": 1.00233364, + "balance_loss_mlp": 1.00076687, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 1.8386358699003484, + "language_loss": 0.79377413, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81650138, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.622253656387329 + }, + { + "auxiliary_loss_clip": 0.01155939, + "auxiliary_loss_mlp": 0.01133333, + "balance_loss_clip": 1.00222123, + "balance_loss_mlp": 1.00077176, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.7668761945001725, + "language_loss": 0.75323749, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.7761302, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 4.077412128448486 + }, + { + "auxiliary_loss_clip": 0.01154353, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_clip": 1.00344193, + "balance_loss_mlp": 1.00012195, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7197211142009984, + "language_loss": 0.59412092, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61684823, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.27083420753479 + }, + { + "auxiliary_loss_clip": 0.01140194, + "auxiliary_loss_mlp": 0.01132656, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00066733, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 2.0615862522196884, + "language_loss": 0.72148478, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74421322, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.595407247543335 + }, + { + "auxiliary_loss_clip": 0.01139963, + "auxiliary_loss_mlp": 0.01132949, + "balance_loss_clip": 1.00215936, + "balance_loss_mlp": 1.00096035, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.426124387476963, + "language_loss": 0.7106601, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73338926, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 2.622952461242676 + }, + { + "auxiliary_loss_clip": 0.01172483, + "auxiliary_loss_mlp": 0.0113342, + "balance_loss_clip": 1.00220168, + "balance_loss_mlp": 1.00085902, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.4231839319940651, + "language_loss": 0.84093332, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86399233, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.552093982696533 + }, + { + "auxiliary_loss_clip": 0.01139252, + "auxiliary_loss_mlp": 0.01133911, + "balance_loss_clip": 1.00218487, + "balance_loss_mlp": 1.00096893, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 1.8531240955845592, + "language_loss": 0.70940733, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.73213893, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 5.413526296615601 + }, + { + "auxiliary_loss_clip": 0.01172658, + "auxiliary_loss_mlp": 0.01132962, + "balance_loss_clip": 1.00238395, + "balance_loss_mlp": 1.00087798, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.3798668167668702, + "language_loss": 0.74920607, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77226228, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.5757083892822266 + }, + { + "auxiliary_loss_clip": 0.01157137, + "auxiliary_loss_mlp": 0.01132641, + "balance_loss_clip": 1.00225878, + "balance_loss_mlp": 1.0007478, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.8903512690223965, + "language_loss": 0.70925832, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.73215604, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.629589796066284 + }, + { + "auxiliary_loss_clip": 0.0115587, + "auxiliary_loss_mlp": 0.01133567, + "balance_loss_clip": 1.00214744, + "balance_loss_mlp": 1.00071979, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 4.11043216159725, + "language_loss": 0.78333008, + "learning_rate": 3.06327495310661e-06, + "loss": 0.80622447, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.5223500728607178 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01133191, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00082076, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 2.6429591817324964, + "language_loss": 0.86716586, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88990593, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.549257278442383 + }, + { + "auxiliary_loss_clip": 0.01146391, + "auxiliary_loss_mlp": 0.01133362, + "balance_loss_clip": 1.00263858, + "balance_loss_mlp": 1.0007062, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 3.337680799071412, + "language_loss": 0.79648662, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.8192842, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.541377305984497 + }, + { + "auxiliary_loss_clip": 0.01156747, + "auxiliary_loss_mlp": 0.01133358, + "balance_loss_clip": 1.00220275, + "balance_loss_mlp": 1.00079679, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.485592364822152, + "language_loss": 0.73580313, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75870419, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.5173542499542236 + }, + { + "auxiliary_loss_clip": 0.01157198, + "auxiliary_loss_mlp": 0.0113304, + "balance_loss_clip": 1.00221157, + "balance_loss_mlp": 1.00066996, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 3.5481364542571954, + "language_loss": 0.76337451, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78627688, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.550894021987915 + }, + { + "auxiliary_loss_clip": 0.0115555, + "auxiliary_loss_mlp": 0.01132328, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.00072122, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5502697533701502, + "language_loss": 0.6826126, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70549142, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.5469744205474854 + }, + { + "auxiliary_loss_clip": 0.01155914, + "auxiliary_loss_mlp": 0.01133285, + "balance_loss_clip": 1.00218463, + "balance_loss_mlp": 1.00072384, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 1.8370821266410866, + "language_loss": 0.72445095, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74734288, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.5153520107269287 + }, + { + "auxiliary_loss_clip": 0.01124706, + "auxiliary_loss_mlp": 0.01132979, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.00079978, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.935609973841108, + "language_loss": 0.75419521, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77677208, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.61752986907959 + }, + { + "auxiliary_loss_clip": 0.01122406, + "auxiliary_loss_mlp": 0.0113255, + "balance_loss_clip": 1.00195086, + "balance_loss_mlp": 1.00084805, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.6377573852711442, + "language_loss": 0.79780984, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82035941, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.618809938430786 + }, + { + "auxiliary_loss_clip": 0.01107978, + "auxiliary_loss_mlp": 0.01132557, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00075936, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.8929814276204278, + "language_loss": 0.73336029, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75576568, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.6868021488189697 + }, + { + "auxiliary_loss_clip": 0.01131508, + "auxiliary_loss_mlp": 0.01133306, + "balance_loss_clip": 1.00273657, + "balance_loss_mlp": 1.00093627, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6623129975549185, + "language_loss": 0.7098245, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73247266, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 2.652578830718994 + }, + { + "auxiliary_loss_clip": 0.01144803, + "auxiliary_loss_mlp": 0.01132946, + "balance_loss_clip": 1.00255489, + "balance_loss_mlp": 1.00057626, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 18.652677066575325, + "language_loss": 0.8233977, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84617513, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.5836355686187744 + }, + { + "auxiliary_loss_clip": 0.01099207, + "auxiliary_loss_mlp": 0.01134334, + "balance_loss_clip": 1.0023855, + "balance_loss_mlp": 1.00091517, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 2.2652894289885457, + "language_loss": 0.69110155, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.7134369, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 2.707681894302368 + }, + { + "auxiliary_loss_clip": 0.01139176, + "auxiliary_loss_mlp": 0.01132801, + "balance_loss_clip": 1.00219071, + "balance_loss_mlp": 1.00081217, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.071628528335586, + "language_loss": 0.72494364, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74766338, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.6608238220214844 + }, + { + "auxiliary_loss_clip": 0.01139033, + "auxiliary_loss_mlp": 0.01133757, + "balance_loss_clip": 1.00224423, + "balance_loss_mlp": 1.00081491, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 1.8196011557106335, + "language_loss": 0.818618, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.84134591, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.569056749343872 + }, + { + "auxiliary_loss_clip": 0.01156383, + "auxiliary_loss_mlp": 0.01133875, + "balance_loss_clip": 1.00236368, + "balance_loss_mlp": 1.00102878, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.718137898322597, + "language_loss": 0.720788, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.74369061, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.5434513092041016 + }, + { + "auxiliary_loss_clip": 0.01137124, + "auxiliary_loss_mlp": 0.01118531, + "balance_loss_clip": 1.00280893, + "balance_loss_mlp": 1.00027561, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.871175807747991, + "language_loss": 0.57329243, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59584898, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.0695650577545166 + }, + { + "auxiliary_loss_clip": 0.01157424, + "auxiliary_loss_mlp": 0.01134036, + "balance_loss_clip": 1.0023334, + "balance_loss_mlp": 1.00061703, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 2.0892355456799545, + "language_loss": 0.74918777, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77210236, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.5398366451263428 + }, + { + "auxiliary_loss_clip": 0.01128308, + "auxiliary_loss_mlp": 0.01132779, + "balance_loss_clip": 1.00220621, + "balance_loss_mlp": 1.00098157, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 1.7656968006505351, + "language_loss": 0.72747135, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75008225, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.6049904823303223 + }, + { + "auxiliary_loss_clip": 0.01109423, + "auxiliary_loss_mlp": 0.011331, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00063503, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 2.111662289823172, + "language_loss": 0.79822469, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82064986, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.680474281311035 + }, + { + "auxiliary_loss_clip": 0.01140521, + "auxiliary_loss_mlp": 0.0113331, + "balance_loss_clip": 1.0020932, + "balance_loss_mlp": 1.00074935, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 1.9689356258674868, + "language_loss": 0.8280862, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85082448, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.556058406829834 + }, + { + "auxiliary_loss_clip": 0.01155759, + "auxiliary_loss_mlp": 0.01133185, + "balance_loss_clip": 1.00223613, + "balance_loss_mlp": 1.0006237, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5875458825187203, + "language_loss": 0.75453353, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77742302, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.5034971237182617 + }, + { + "auxiliary_loss_clip": 0.01141077, + "auxiliary_loss_mlp": 0.01133275, + "balance_loss_clip": 1.00237083, + "balance_loss_mlp": 1.00090456, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6681125780277888, + "language_loss": 0.81411386, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83685732, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.6216890811920166 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01133294, + "balance_loss_clip": 1.0021466, + "balance_loss_mlp": 1.00082886, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.9256624324667575, + "language_loss": 0.78771907, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81044239, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 3.957986831665039 + }, + { + "auxiliary_loss_clip": 0.01155903, + "auxiliary_loss_mlp": 0.01133707, + "balance_loss_clip": 1.00217378, + "balance_loss_mlp": 1.00086045, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.7573867948686637, + "language_loss": 0.70287377, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72576988, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 2.528488874435425 + }, + { + "auxiliary_loss_clip": 0.01125567, + "auxiliary_loss_mlp": 0.00748426, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00124955, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.8771492095894644, + "language_loss": 0.67241955, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69115949, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.6050455570220947 + }, + { + "auxiliary_loss_clip": 0.01122016, + "auxiliary_loss_mlp": 0.01118533, + "balance_loss_clip": 1.00285625, + "balance_loss_mlp": 1.0002774, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8579427017874016, + "language_loss": 0.58112502, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60353053, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.2222602367401123 + }, + { + "auxiliary_loss_clip": 0.01172566, + "auxiliary_loss_mlp": 0.01133982, + "balance_loss_clip": 1.00232601, + "balance_loss_mlp": 1.00103939, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.654416999589788, + "language_loss": 0.81036592, + "learning_rate": 3.054353992805076e-06, + "loss": 0.83343136, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.5403385162353516 + }, + { + "auxiliary_loss_clip": 0.01172562, + "auxiliary_loss_mlp": 0.0113319, + "balance_loss_clip": 1.00226092, + "balance_loss_mlp": 1.0008204, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.9917852806812044, + "language_loss": 0.72272408, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7457816, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 2.605311393737793 + }, + { + "auxiliary_loss_clip": 0.01153597, + "auxiliary_loss_mlp": 0.01119493, + "balance_loss_clip": 1.00247526, + "balance_loss_mlp": 1.00047481, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8915631714190592, + "language_loss": 0.65816975, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68090069, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 4.577608346939087 + }, + { + "auxiliary_loss_clip": 0.01157226, + "auxiliary_loss_mlp": 0.01132708, + "balance_loss_clip": 1.00225973, + "balance_loss_mlp": 1.00090981, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 1.8162076108776273, + "language_loss": 0.74532235, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76822174, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.529991388320923 + }, + { + "auxiliary_loss_clip": 0.01106416, + "auxiliary_loss_mlp": 0.01133053, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.0007782, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 3.5826784098722895, + "language_loss": 0.75346941, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77586412, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 2.735008716583252 + }, + { + "auxiliary_loss_clip": 0.01121842, + "auxiliary_loss_mlp": 0.0113347, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00090909, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 2.0284160371910076, + "language_loss": 0.63847405, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66102713, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 2.706839084625244 + }, + { + "auxiliary_loss_clip": 0.01108711, + "auxiliary_loss_mlp": 0.01134152, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00092363, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.269393718491831, + "language_loss": 0.73366237, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75609094, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.678805351257324 + }, + { + "auxiliary_loss_clip": 0.01157141, + "auxiliary_loss_mlp": 0.011336, + "balance_loss_clip": 1.00223231, + "balance_loss_mlp": 1.00075293, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.732400716352378, + "language_loss": 0.73899305, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76190048, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 3.9471168518066406 + }, + { + "auxiliary_loss_clip": 0.01140519, + "auxiliary_loss_mlp": 0.00748229, + "balance_loss_clip": 1.0021615, + "balance_loss_mlp": 1.0011549, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 1.7812499027670412, + "language_loss": 0.80633467, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82522213, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 2.590484142303467 + }, + { + "auxiliary_loss_clip": 0.01107113, + "auxiliary_loss_mlp": 0.01132114, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00069749, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5965646940748615, + "language_loss": 0.81317794, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83557022, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.66316556930542 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01133083, + "balance_loss_clip": 1.00205266, + "balance_loss_mlp": 1.00090408, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 2.2698557712689165, + "language_loss": 0.81193233, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83451557, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.5817692279815674 + }, + { + "auxiliary_loss_clip": 0.01139932, + "auxiliary_loss_mlp": 0.01133389, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00082839, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.708714592674714, + "language_loss": 0.69140375, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71413696, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.7553470134735107 + }, + { + "auxiliary_loss_clip": 0.0115602, + "auxiliary_loss_mlp": 0.01134072, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00093889, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 2.561411168086425, + "language_loss": 0.69392508, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71682608, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.5539796352386475 + }, + { + "auxiliary_loss_clip": 0.01123374, + "auxiliary_loss_mlp": 0.01133478, + "balance_loss_clip": 1.00196648, + "balance_loss_mlp": 1.00101292, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.6680260516218803, + "language_loss": 0.72962731, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75219584, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.6579554080963135 + }, + { + "auxiliary_loss_clip": 0.01124332, + "auxiliary_loss_mlp": 0.01132724, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00083065, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.9858847304987122, + "language_loss": 0.88513637, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90770692, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.6070873737335205 + }, + { + "auxiliary_loss_clip": 0.01108024, + "auxiliary_loss_mlp": 0.01133225, + "balance_loss_clip": 1.00205505, + "balance_loss_mlp": 1.00075936, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 1.8235454200331398, + "language_loss": 0.70043832, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.7228508, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.6611061096191406 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01133404, + "balance_loss_clip": 1.00221801, + "balance_loss_mlp": 1.00084305, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.6759694779169523, + "language_loss": 0.73979896, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76268935, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.5169858932495117 + }, + { + "auxiliary_loss_clip": 0.01124816, + "auxiliary_loss_mlp": 0.01133397, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00083637, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 6.303743002037028, + "language_loss": 0.7987116, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82129371, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.602733612060547 + }, + { + "auxiliary_loss_clip": 0.01155718, + "auxiliary_loss_mlp": 0.01132632, + "balance_loss_clip": 1.00225163, + "balance_loss_mlp": 1.00092971, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.105388782984496, + "language_loss": 0.78273493, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.80561841, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.525346517562866 + }, + { + "auxiliary_loss_clip": 0.01122607, + "auxiliary_loss_mlp": 0.0111792, + "balance_loss_clip": 1.00278306, + "balance_loss_mlp": 1.00042713, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7397181685011394, + "language_loss": 0.53511906, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55752432, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.2128961086273193 + }, + { + "auxiliary_loss_clip": 0.01140163, + "auxiliary_loss_mlp": 0.01132945, + "balance_loss_clip": 1.00210297, + "balance_loss_mlp": 1.0007658, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.5760251575069775, + "language_loss": 0.83509398, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85782504, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.639686107635498 + }, + { + "auxiliary_loss_clip": 0.01139164, + "auxiliary_loss_mlp": 0.01133453, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00060666, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 1.8869978304324226, + "language_loss": 0.92800367, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.95072985, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.631056785583496 + }, + { + "auxiliary_loss_clip": 0.01106655, + "auxiliary_loss_mlp": 0.01133443, + "balance_loss_clip": 1.0019151, + "balance_loss_mlp": 1.00069141, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.7220488734109776, + "language_loss": 0.77100992, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.7934109, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 2.6699230670928955 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01132985, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00080609, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.5589080380279938, + "language_loss": 0.78960019, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81232888, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.6204569339752197 + }, + { + "auxiliary_loss_clip": 0.01114872, + "auxiliary_loss_mlp": 0.01134507, + "balance_loss_clip": 1.00260293, + "balance_loss_mlp": 1.00099218, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 1.7370506472363358, + "language_loss": 0.71270835, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73520213, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.6629631519317627 + }, + { + "auxiliary_loss_clip": 0.0112512, + "auxiliary_loss_mlp": 0.01133494, + "balance_loss_clip": 1.00214517, + "balance_loss_mlp": 1.00093377, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.9833808524232632, + "language_loss": 0.81865406, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84124023, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.6971213817596436 + }, + { + "auxiliary_loss_clip": 0.0112394, + "auxiliary_loss_mlp": 0.01133784, + "balance_loss_clip": 1.00209987, + "balance_loss_mlp": 1.00093699, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.96237038531414, + "language_loss": 0.83122259, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85379982, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.690624952316284 + }, + { + "auxiliary_loss_clip": 0.01155383, + "auxiliary_loss_mlp": 0.0113423, + "balance_loss_clip": 1.00214863, + "balance_loss_mlp": 1.00081062, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 1.9654138259571663, + "language_loss": 0.76643944, + "learning_rate": 3.045403886269181e-06, + "loss": 0.78933555, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.5449752807617188 + }, + { + "auxiliary_loss_clip": 0.01140379, + "auxiliary_loss_mlp": 0.01133606, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00075948, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.437381208171229, + "language_loss": 0.77547395, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79821384, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.6324636936187744 + }, + { + "auxiliary_loss_clip": 0.0115727, + "auxiliary_loss_mlp": 0.01133395, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.00092936, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.9847888951432708, + "language_loss": 0.76317668, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.7860834, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.5416603088378906 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01132615, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00091219, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.518057386392224, + "language_loss": 0.70185983, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72474229, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.6104671955108643 + }, + { + "auxiliary_loss_clip": 0.01172358, + "auxiliary_loss_mlp": 0.01132304, + "balance_loss_clip": 1.00222039, + "balance_loss_mlp": 1.00069666, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.5468453975301752, + "language_loss": 0.79328781, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81633443, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.4878926277160645 + }, + { + "auxiliary_loss_clip": 0.01105545, + "auxiliary_loss_mlp": 0.01133772, + "balance_loss_clip": 1.00186503, + "balance_loss_mlp": 1.00092506, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.8046084942103342, + "language_loss": 0.89556724, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91796041, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 2.631911516189575 + }, + { + "auxiliary_loss_clip": 0.0115578, + "auxiliary_loss_mlp": 0.01133816, + "balance_loss_clip": 1.00214994, + "balance_loss_mlp": 1.00068307, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 2.108399371641723, + "language_loss": 0.64258385, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66547978, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.5342838764190674 + }, + { + "auxiliary_loss_clip": 0.01141662, + "auxiliary_loss_mlp": 0.01132697, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00070882, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.6417536397075154, + "language_loss": 0.73042595, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75316954, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 4.110396146774292 + }, + { + "auxiliary_loss_clip": 0.01107535, + "auxiliary_loss_mlp": 0.01133012, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.00083232, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.5176903858911222, + "language_loss": 0.7551626, + "learning_rate": 3.042746441843029e-06, + "loss": 0.7775681, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.708266496658325 + }, + { + "auxiliary_loss_clip": 0.01140283, + "auxiliary_loss_mlp": 0.01118184, + "balance_loss_clip": 1.00317705, + "balance_loss_mlp": 1.00069165, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8786085220765659, + "language_loss": 0.62621766, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64880234, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 3.047795057296753 + }, + { + "auxiliary_loss_clip": 0.01138736, + "auxiliary_loss_mlp": 0.01131807, + "balance_loss_clip": 1.00206423, + "balance_loss_mlp": 1.00058103, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 1.6765052232424815, + "language_loss": 0.80267251, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82537794, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.5916101932525635 + }, + { + "auxiliary_loss_clip": 0.01172362, + "auxiliary_loss_mlp": 0.01132741, + "balance_loss_clip": 1.00219131, + "balance_loss_mlp": 1.00113344, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.244647051121143, + "language_loss": 0.84009242, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86314344, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.5086922645568848 + }, + { + "auxiliary_loss_clip": 0.01137223, + "auxiliary_loss_mlp": 0.00746926, + "balance_loss_clip": 1.00288689, + "balance_loss_mlp": 1.0007143, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7310468419631646, + "language_loss": 0.63153934, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65038085, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 3.061016321182251 + }, + { + "auxiliary_loss_clip": 0.01140623, + "auxiliary_loss_mlp": 0.01132409, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00089741, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7222379254392057, + "language_loss": 0.71100485, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73373508, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.558318853378296 + }, + { + "auxiliary_loss_clip": 0.01155779, + "auxiliary_loss_mlp": 0.01133315, + "balance_loss_clip": 1.00204027, + "balance_loss_mlp": 1.00065899, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 1.5887452852583113, + "language_loss": 0.72476494, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.74765587, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 3.9507172107696533 + }, + { + "auxiliary_loss_clip": 0.01156754, + "auxiliary_loss_mlp": 0.01132569, + "balance_loss_clip": 1.00216556, + "balance_loss_mlp": 1.00058007, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6081943393574258, + "language_loss": 0.72489232, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74778557, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.6952099800109863 + }, + { + "auxiliary_loss_clip": 0.01154159, + "auxiliary_loss_mlp": 0.01117777, + "balance_loss_clip": 1.00318456, + "balance_loss_mlp": 1.00028408, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7215899782942707, + "language_loss": 0.62526822, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64798754, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 3.1193182468414307 + }, + { + "auxiliary_loss_clip": 0.01120613, + "auxiliary_loss_mlp": 0.0074691, + "balance_loss_clip": 1.00279331, + "balance_loss_mlp": 1.00059247, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8589382380890535, + "language_loss": 0.59225953, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61093473, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 4.546623945236206 + }, + { + "auxiliary_loss_clip": 0.01139974, + "auxiliary_loss_mlp": 0.01132787, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00089407, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.6927280122724642, + "language_loss": 0.71687335, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73960096, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 4.047734498977661 + }, + { + "auxiliary_loss_clip": 0.01106847, + "auxiliary_loss_mlp": 0.01132299, + "balance_loss_clip": 1.00188816, + "balance_loss_mlp": 1.00088239, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 2.508031505719813, + "language_loss": 0.83183545, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85422689, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.6897478103637695 + }, + { + "auxiliary_loss_clip": 0.01123572, + "auxiliary_loss_mlp": 0.01117992, + "balance_loss_clip": 1.00267959, + "balance_loss_mlp": 1.00049913, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8088017715824867, + "language_loss": 0.56482041, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58723605, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.2484426498413086 + }, + { + "auxiliary_loss_clip": 0.01156657, + "auxiliary_loss_mlp": 0.00748249, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.00123978, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 1.8857665078179728, + "language_loss": 0.94896036, + "learning_rate": 3.038422700166474e-06, + "loss": 0.96800935, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.5187246799468994 + }, + { + "auxiliary_loss_clip": 0.01125788, + "auxiliary_loss_mlp": 0.01131879, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00046229, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.689847155123497, + "language_loss": 0.69870198, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.72127867, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.6802008152008057 + }, + { + "auxiliary_loss_clip": 0.01157235, + "auxiliary_loss_mlp": 0.01133278, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.00081289, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.5490279611690916, + "language_loss": 0.83968383, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86258888, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.5797083377838135 + }, + { + "auxiliary_loss_clip": 0.0114081, + "auxiliary_loss_mlp": 0.01131965, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00073946, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.1911701478322803, + "language_loss": 0.67624277, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.6989705, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.6616313457489014 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01133019, + "balance_loss_clip": 1.00225365, + "balance_loss_mlp": 1.00083923, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.955249156079344, + "language_loss": 0.77289236, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79562652, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.5769011974334717 + }, + { + "auxiliary_loss_clip": 0.01106953, + "auxiliary_loss_mlp": 0.01131399, + "balance_loss_clip": 1.00165343, + "balance_loss_mlp": 1.00065041, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5305918842288604, + "language_loss": 0.73471773, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75710124, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.6300721168518066 + }, + { + "auxiliary_loss_clip": 0.01140836, + "auxiliary_loss_mlp": 0.01133294, + "balance_loss_clip": 1.0025208, + "balance_loss_mlp": 1.001019, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 1.7294982873947113, + "language_loss": 0.77646804, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79920936, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 2.616218090057373 + }, + { + "auxiliary_loss_clip": 0.01153441, + "auxiliary_loss_mlp": 0.01117967, + "balance_loss_clip": 1.00295639, + "balance_loss_mlp": 1.00047481, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7687030301060152, + "language_loss": 0.57490075, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59761488, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.1690492630004883 + }, + { + "auxiliary_loss_clip": 0.01123468, + "auxiliary_loss_mlp": 0.01134058, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00073457, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.097657150469575, + "language_loss": 0.85926294, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.88183826, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.561697483062744 + }, + { + "auxiliary_loss_clip": 0.01139276, + "auxiliary_loss_mlp": 0.01116852, + "balance_loss_clip": 1.00308347, + "balance_loss_mlp": 1.00012207, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7635608987607269, + "language_loss": 0.59762341, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.62018472, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 2.9458839893341064 + }, + { + "auxiliary_loss_clip": 0.01157087, + "auxiliary_loss_mlp": 0.01133152, + "balance_loss_clip": 1.00216663, + "balance_loss_mlp": 1.00087774, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.8788735216904566, + "language_loss": 0.71609199, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73899436, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.640364646911621 + }, + { + "auxiliary_loss_clip": 0.01125105, + "auxiliary_loss_mlp": 0.0074825, + "balance_loss_clip": 1.00202632, + "balance_loss_mlp": 1.00117636, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.341003058386044, + "language_loss": 0.76498616, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78371972, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.6810507774353027 + }, + { + "auxiliary_loss_clip": 0.01157133, + "auxiliary_loss_mlp": 0.01132833, + "balance_loss_clip": 1.00209391, + "balance_loss_mlp": 1.00074887, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 1.8793016573238315, + "language_loss": 0.70274943, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72564912, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 2.542250156402588 + }, + { + "auxiliary_loss_clip": 0.01139939, + "auxiliary_loss_mlp": 0.0074824, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00118935, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.5170476438140357, + "language_loss": 0.76083302, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.7797147, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.6112887859344482 + }, + { + "auxiliary_loss_clip": 0.01140374, + "auxiliary_loss_mlp": 0.01133074, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00079978, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.8260972111513216, + "language_loss": 0.77724481, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79997927, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.5435585975646973 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01117049, + "balance_loss_clip": 1.00245881, + "balance_loss_mlp": 1.00031972, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8388531987000716, + "language_loss": 0.6336199, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65615678, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.198751926422119 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01133977, + "balance_loss_clip": 1.00231802, + "balance_loss_mlp": 1.00084376, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 1.8017887074706522, + "language_loss": 0.64835501, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67092478, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.739584445953369 + }, + { + "auxiliary_loss_clip": 0.01140329, + "auxiliary_loss_mlp": 0.01132742, + "balance_loss_clip": 1.00206578, + "balance_loss_mlp": 1.0009439, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.669555721245512, + "language_loss": 0.71349323, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73622394, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.7555463314056396 + }, + { + "auxiliary_loss_clip": 0.01172503, + "auxiliary_loss_mlp": 0.01133462, + "balance_loss_clip": 1.00226855, + "balance_loss_mlp": 1.00109243, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 2.4514413429801185, + "language_loss": 0.62053508, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64359474, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.56247878074646 + }, + { + "auxiliary_loss_clip": 0.01125014, + "auxiliary_loss_mlp": 0.01133103, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.0008285, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 3.4153491938032032, + "language_loss": 0.72019279, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74277389, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.638383388519287 + }, + { + "auxiliary_loss_clip": 0.01092606, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.001091, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 3.7504812413517783, + "language_loss": 0.76690876, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78917229, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.678396701812744 + }, + { + "auxiliary_loss_clip": 0.01138913, + "auxiliary_loss_mlp": 0.01132058, + "balance_loss_clip": 1.00183737, + "balance_loss_mlp": 1.00054586, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.8873278557272026, + "language_loss": 0.6214813, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64419103, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.594731569290161 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01131634, + "balance_loss_clip": 1.00182033, + "balance_loss_mlp": 1.00059903, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.6470111855599334, + "language_loss": 0.89078695, + "learning_rate": 3.031090453282605e-06, + "loss": 0.91318083, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.6668288707733154 + }, + { + "auxiliary_loss_clip": 0.01106812, + "auxiliary_loss_mlp": 0.01132587, + "balance_loss_clip": 1.00214088, + "balance_loss_mlp": 1.0006938, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.6927022628274562, + "language_loss": 0.81295532, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83534932, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.645871639251709 + }, + { + "auxiliary_loss_clip": 0.01142048, + "auxiliary_loss_mlp": 0.01132545, + "balance_loss_clip": 1.00226557, + "balance_loss_mlp": 1.00103331, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.671512414591794, + "language_loss": 0.80694807, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82969391, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 3.999284029006958 + }, + { + "auxiliary_loss_clip": 0.01172257, + "auxiliary_loss_mlp": 0.00748278, + "balance_loss_clip": 1.00223148, + "balance_loss_mlp": 1.00111485, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5529955532615434, + "language_loss": 0.74890047, + "learning_rate": 3.030089132216836e-06, + "loss": 0.7681058, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.487797498703003 + }, + { + "auxiliary_loss_clip": 0.01141938, + "auxiliary_loss_mlp": 0.00748442, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00136197, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.7934102878923595, + "language_loss": 0.81138349, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83028734, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.6599252223968506 + }, + { + "auxiliary_loss_clip": 0.01172468, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_clip": 1.0022521, + "balance_loss_mlp": 1.00071263, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.6999554984007614, + "language_loss": 0.85827661, + "learning_rate": 3.029421389513147e-06, + "loss": 0.88133693, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 2.5038907527923584 + }, + { + "auxiliary_loss_clip": 0.01155707, + "auxiliary_loss_mlp": 0.01133533, + "balance_loss_clip": 1.00220883, + "balance_loss_mlp": 1.00116277, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 2.2235471597519747, + "language_loss": 0.85028887, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87318122, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.5268537998199463 + }, + { + "auxiliary_loss_clip": 0.01157153, + "auxiliary_loss_mlp": 0.01133509, + "balance_loss_clip": 1.00228882, + "balance_loss_mlp": 1.00085294, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 1.884260572301979, + "language_loss": 0.81441468, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83732128, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.6533303260803223 + }, + { + "auxiliary_loss_clip": 0.01155581, + "auxiliary_loss_mlp": 0.01132649, + "balance_loss_clip": 1.00207961, + "balance_loss_mlp": 1.00066066, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.6676080233501744, + "language_loss": 0.77684677, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79972905, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 4.07875919342041 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.01131907, + "balance_loss_clip": 1.00207376, + "balance_loss_mlp": 1.00058651, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.7006031517660996, + "language_loss": 0.81479192, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83753049, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.5838205814361572 + }, + { + "auxiliary_loss_clip": 0.01155587, + "auxiliary_loss_mlp": 0.01133015, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.00121689, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7881126195601207, + "language_loss": 0.76251757, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78540355, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 2.5234391689300537 + }, + { + "auxiliary_loss_clip": 0.01157049, + "auxiliary_loss_mlp": 0.01132267, + "balance_loss_clip": 1.0022366, + "balance_loss_mlp": 1.00075579, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 1.8241062179514398, + "language_loss": 0.57290441, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59579754, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.531714677810669 + }, + { + "auxiliary_loss_clip": 0.01140559, + "auxiliary_loss_mlp": 0.0113221, + "balance_loss_clip": 1.0019176, + "balance_loss_mlp": 1.00069785, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.0071089214111852, + "language_loss": 0.82249653, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84522426, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 3.9978835582733154 + }, + { + "auxiliary_loss_clip": 0.01155502, + "auxiliary_loss_mlp": 0.01132079, + "balance_loss_clip": 1.00220442, + "balance_loss_mlp": 1.00085378, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.6720103681360865, + "language_loss": 0.836564, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85943985, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 4.043091773986816 + }, + { + "auxiliary_loss_clip": 0.01172226, + "auxiliary_loss_mlp": 0.01131842, + "balance_loss_clip": 1.00221336, + "balance_loss_mlp": 1.00080693, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.6313461891555756, + "language_loss": 0.73038316, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75342381, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.555467367172241 + }, + { + "auxiliary_loss_clip": 0.01172491, + "auxiliary_loss_mlp": 0.01132842, + "balance_loss_clip": 1.00228024, + "balance_loss_mlp": 1.00094879, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 1.7992362557734525, + "language_loss": 0.76258641, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78563976, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.5068743228912354 + }, + { + "auxiliary_loss_clip": 0.01082213, + "auxiliary_loss_mlp": 0.01132746, + "balance_loss_clip": 1.00269663, + "balance_loss_mlp": 1.00075769, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.6703987616858829, + "language_loss": 0.75926065, + "learning_rate": 3.025746016302734e-06, + "loss": 0.78141022, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 2.7192556858062744 + }, + { + "auxiliary_loss_clip": 0.01140661, + "auxiliary_loss_mlp": 0.00748178, + "balance_loss_clip": 1.00214899, + "balance_loss_mlp": 1.00119114, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 3.521781479591781, + "language_loss": 0.6766212, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69550955, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 2.777479648590088 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01132154, + "balance_loss_clip": 1.00231063, + "balance_loss_mlp": 1.00073791, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 2.4122302471810473, + "language_loss": 0.76880884, + "learning_rate": 3.025077260480735e-06, + "loss": 0.79155064, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.6084015369415283 + }, + { + "auxiliary_loss_clip": 0.01081058, + "auxiliary_loss_mlp": 0.01131793, + "balance_loss_clip": 1.00200093, + "balance_loss_mlp": 1.00085378, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.6873291875791607, + "language_loss": 0.79318684, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81531537, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.7205545902252197 + }, + { + "auxiliary_loss_clip": 0.01140393, + "auxiliary_loss_mlp": 0.00748339, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00137615, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 2.137902722101418, + "language_loss": 0.67974246, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.6986298, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01132114, + "balance_loss_clip": 1.00199544, + "balance_loss_mlp": 1.00069761, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.9179660781216852, + "language_loss": 0.762797, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78550017, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.5691580772399902 + }, + { + "auxiliary_loss_clip": 0.01124975, + "auxiliary_loss_mlp": 0.01132612, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.00071836, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 1.8369139966258725, + "language_loss": 0.67021328, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69278914, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.673112630844116 + }, + { + "auxiliary_loss_clip": 0.01155835, + "auxiliary_loss_mlp": 0.01132266, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.00075412, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.733589520537147, + "language_loss": 0.7227596, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7456407, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.610964775085449 + }, + { + "auxiliary_loss_clip": 0.01172156, + "auxiliary_loss_mlp": 0.01132744, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00075507, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.8032412594463918, + "language_loss": 0.73728037, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76032931, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.5606040954589844 + }, + { + "auxiliary_loss_clip": 0.01172236, + "auxiliary_loss_mlp": 0.01132136, + "balance_loss_clip": 1.00233209, + "balance_loss_mlp": 1.00091052, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.9064935083174048, + "language_loss": 0.84524715, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86829084, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.5092122554779053 + }, + { + "auxiliary_loss_clip": 0.01140026, + "auxiliary_loss_mlp": 0.01131234, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00077152, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.8984352416691763, + "language_loss": 0.80416751, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.8268801, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.60465145111084 + }, + { + "auxiliary_loss_clip": 0.01172167, + "auxiliary_loss_mlp": 0.01131863, + "balance_loss_clip": 1.00224316, + "balance_loss_mlp": 1.00082815, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.7763877926678795, + "language_loss": 0.75712293, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.78016323, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.5608115196228027 + }, + { + "auxiliary_loss_clip": 0.01140592, + "auxiliary_loss_mlp": 0.01131883, + "balance_loss_clip": 1.00208759, + "balance_loss_mlp": 1.00075245, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4675775339283739, + "language_loss": 0.80025053, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82297522, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.6803622245788574 + }, + { + "auxiliary_loss_clip": 0.01091836, + "auxiliary_loss_mlp": 0.01132206, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00079012, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.7302659321821001, + "language_loss": 0.69381464, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71605504, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 2.6862449645996094 + }, + { + "auxiliary_loss_clip": 0.01141793, + "auxiliary_loss_mlp": 0.00748274, + "balance_loss_clip": 1.00209963, + "balance_loss_mlp": 1.00126719, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.048907808063399, + "language_loss": 0.76623988, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78514057, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.5954318046569824 + }, + { + "auxiliary_loss_clip": 0.01145856, + "auxiliary_loss_mlp": 0.00748336, + "balance_loss_clip": 1.00244296, + "balance_loss_mlp": 1.00137556, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.4812610134831028, + "language_loss": 0.84644139, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86538327, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.6278457641601562 + }, + { + "auxiliary_loss_clip": 0.01155717, + "auxiliary_loss_mlp": 0.01131928, + "balance_loss_clip": 1.00190651, + "balance_loss_mlp": 1.00079763, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 1.8558531316131865, + "language_loss": 0.77119184, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79406828, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.514909505844116 + }, + { + "auxiliary_loss_clip": 0.01155816, + "auxiliary_loss_mlp": 0.01132309, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00079691, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 1.7745683032585224, + "language_loss": 0.58719254, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.6100738, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.5559864044189453 + }, + { + "auxiliary_loss_clip": 0.01170723, + "auxiliary_loss_mlp": 0.01116806, + "balance_loss_clip": 1.00335336, + "balance_loss_mlp": 1.00007629, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8643275775646929, + "language_loss": 0.59884542, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.62172073, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.1982967853546143 + }, + { + "auxiliary_loss_clip": 0.01125045, + "auxiliary_loss_mlp": 0.01131963, + "balance_loss_clip": 1.00211525, + "balance_loss_mlp": 1.00073719, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 1.849623885118888, + "language_loss": 0.83412218, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85669225, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.623206377029419 + }, + { + "auxiliary_loss_clip": 0.01140402, + "auxiliary_loss_mlp": 0.0113222, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00061321, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 2.5592965885070713, + "language_loss": 0.70909965, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.73182595, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.6187098026275635 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_clip": 1.00219798, + "balance_loss_mlp": 1.00077128, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.6162012693323005, + "language_loss": 0.70571375, + "learning_rate": 3.018716339744759e-06, + "loss": 0.72860229, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.6416056156158447 + }, + { + "auxiliary_loss_clip": 0.01155313, + "auxiliary_loss_mlp": 0.01132412, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00080466, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.465446995825087, + "language_loss": 0.74011135, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.76298869, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.56740140914917 + }, + { + "auxiliary_loss_clip": 0.01138658, + "auxiliary_loss_mlp": 0.01131859, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00053811, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.59335307321025, + "language_loss": 0.78543842, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80814362, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.56478214263916 + }, + { + "auxiliary_loss_clip": 0.01153931, + "auxiliary_loss_mlp": 0.01116824, + "balance_loss_clip": 1.00328469, + "balance_loss_mlp": 1.00009429, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7074747498688768, + "language_loss": 0.59207594, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61478353, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 4.5590009689331055 + }, + { + "auxiliary_loss_clip": 0.01138397, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00078738, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.8750089248174178, + "language_loss": 0.84612793, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86884058, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.5899975299835205 + }, + { + "auxiliary_loss_clip": 0.01155752, + "auxiliary_loss_mlp": 0.00748201, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00128734, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 10.682377272314925, + "language_loss": 0.83345532, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85249484, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.5217013359069824 + }, + { + "auxiliary_loss_clip": 0.01138824, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.00213623, + "balance_loss_mlp": 1.00094223, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.9297408986403273, + "language_loss": 0.81003684, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83275247, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.575472593307495 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.01132368, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00085616, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.08489811721723, + "language_loss": 0.71510857, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73751748, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.6526639461517334 + }, + { + "auxiliary_loss_clip": 0.01155519, + "auxiliary_loss_mlp": 0.01133458, + "balance_loss_clip": 1.00215805, + "balance_loss_mlp": 1.00108767, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.6074692921621854, + "language_loss": 0.79248917, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8153789, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 2.591557264328003 + }, + { + "auxiliary_loss_clip": 0.01122504, + "auxiliary_loss_mlp": 0.01134222, + "balance_loss_clip": 1.00213623, + "balance_loss_mlp": 1.00099373, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.81911540013127, + "language_loss": 0.72383583, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.7464031, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 4.085792541503906 + }, + { + "auxiliary_loss_clip": 0.01125209, + "auxiliary_loss_mlp": 0.01132338, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00073099, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.022360312280183, + "language_loss": 0.88501179, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90758729, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.6078085899353027 + }, + { + "auxiliary_loss_clip": 0.01108401, + "auxiliary_loss_mlp": 0.01133422, + "balance_loss_clip": 1.00221264, + "balance_loss_mlp": 1.00105214, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9807002834657639, + "language_loss": 0.78561521, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80803347, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.658182144165039 + }, + { + "auxiliary_loss_clip": 0.01123795, + "auxiliary_loss_mlp": 0.01133276, + "balance_loss_clip": 1.00203764, + "balance_loss_mlp": 1.00090623, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.891417893805029, + "language_loss": 0.71209097, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73466176, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.6405179500579834 + }, + { + "auxiliary_loss_clip": 0.01157086, + "auxiliary_loss_mlp": 0.01132828, + "balance_loss_clip": 1.00221801, + "balance_loss_mlp": 1.00083947, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3686234917890223, + "language_loss": 0.81026959, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83316875, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 5.3938093185424805 + }, + { + "auxiliary_loss_clip": 0.01106847, + "auxiliary_loss_mlp": 0.01133575, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00101376, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.0408496518484665, + "language_loss": 0.83660591, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.8590101, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.6830217838287354 + }, + { + "auxiliary_loss_clip": 0.0111394, + "auxiliary_loss_mlp": 0.01133444, + "balance_loss_clip": 1.00299168, + "balance_loss_mlp": 1.0010736, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.6976114508215672, + "language_loss": 0.76589787, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78837168, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.6940836906433105 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01133072, + "balance_loss_clip": 1.00219631, + "balance_loss_mlp": 1.00089264, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 1.7551325661805095, + "language_loss": 0.77595311, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.7985273, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.5867414474487305 + }, + { + "auxiliary_loss_clip": 0.01155605, + "auxiliary_loss_mlp": 0.01133353, + "balance_loss_clip": 1.00218213, + "balance_loss_mlp": 1.00107861, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.8239181048151194, + "language_loss": 0.67896485, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70185441, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.5393903255462646 + }, + { + "auxiliary_loss_clip": 0.01172197, + "auxiliary_loss_mlp": 0.01133098, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00091922, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0221545530097687, + "language_loss": 0.83323389, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85628688, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 2.466470241546631 + }, + { + "auxiliary_loss_clip": 0.01161768, + "auxiliary_loss_mlp": 0.01133845, + "balance_loss_clip": 1.00245428, + "balance_loss_mlp": 1.00090313, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.8667756539418443, + "language_loss": 0.5900563, + "learning_rate": 3.012341473657572e-06, + "loss": 0.61301243, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.5494658946990967 + }, + { + "auxiliary_loss_clip": 0.01124095, + "auxiliary_loss_mlp": 0.01133334, + "balance_loss_clip": 1.00198507, + "balance_loss_mlp": 1.00096416, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.2519500683754936, + "language_loss": 0.88011307, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.90268731, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.636589288711548 + }, + { + "auxiliary_loss_clip": 0.01145235, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_clip": 1.00242925, + "balance_loss_mlp": 1.00070953, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.7360400316866431, + "language_loss": 0.74981129, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77260113, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.598651647567749 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01132564, + "balance_loss_clip": 1.00239778, + "balance_loss_mlp": 1.000862, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.107080476348162, + "language_loss": 0.68598127, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70892429, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.493772506713867 + }, + { + "auxiliary_loss_clip": 0.01172292, + "auxiliary_loss_mlp": 0.01133261, + "balance_loss_clip": 1.0022279, + "balance_loss_mlp": 1.00079536, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 1.9546520547678117, + "language_loss": 0.65613115, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67918658, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.6126668453216553 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.01133907, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00086915, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.0816080664687697, + "language_loss": 0.75001299, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77290696, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.5599422454833984 + }, + { + "auxiliary_loss_clip": 0.011565, + "auxiliary_loss_mlp": 0.01132826, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00074172, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.6977385310233517, + "language_loss": 0.73198104, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75487429, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.5381650924682617 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01133016, + "balance_loss_clip": 1.00228441, + "balance_loss_mlp": 1.00064647, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6314859204113703, + "language_loss": 0.75506103, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77769101, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.6285178661346436 + }, + { + "auxiliary_loss_clip": 0.0114059, + "auxiliary_loss_mlp": 0.01133615, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00095916, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 1.8104034071435735, + "language_loss": 0.72799659, + "learning_rate": 3.009653168561666e-06, + "loss": 0.75073862, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.677628517150879 + }, + { + "auxiliary_loss_clip": 0.01141556, + "auxiliary_loss_mlp": 0.01133381, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00101089, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 5.1493095422603385, + "language_loss": 0.89423501, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91698432, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.5262844562530518 + }, + { + "auxiliary_loss_clip": 0.0114032, + "auxiliary_loss_mlp": 0.01132312, + "balance_loss_clip": 1.00208032, + "balance_loss_mlp": 1.00070524, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 1.75792857716335, + "language_loss": 0.75179148, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.77451777, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 2.586775541305542 + }, + { + "auxiliary_loss_clip": 0.01156644, + "auxiliary_loss_mlp": 0.01132453, + "balance_loss_clip": 1.00230694, + "balance_loss_mlp": 1.00075102, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.522834063573655, + "language_loss": 0.75896013, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.78185105, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.5495693683624268 + }, + { + "auxiliary_loss_clip": 0.01140362, + "auxiliary_loss_mlp": 0.01134034, + "balance_loss_clip": 1.00229836, + "balance_loss_mlp": 1.00071025, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.1301727371281225, + "language_loss": 0.87591273, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89865667, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 2.5826284885406494 + }, + { + "auxiliary_loss_clip": 0.01172206, + "auxiliary_loss_mlp": 0.01132962, + "balance_loss_clip": 1.00227618, + "balance_loss_mlp": 1.00078273, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.080048936395979, + "language_loss": 0.6741944, + "learning_rate": 3.007971733162737e-06, + "loss": 0.69724607, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.5050981044769287 + }, + { + "auxiliary_loss_clip": 0.01141683, + "auxiliary_loss_mlp": 0.01132518, + "balance_loss_clip": 1.00201845, + "balance_loss_mlp": 1.00081551, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.7216331677185595, + "language_loss": 0.81352586, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83626783, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.5459232330322266 + }, + { + "auxiliary_loss_clip": 0.01139678, + "auxiliary_loss_mlp": 0.01131978, + "balance_loss_clip": 1.00202978, + "balance_loss_mlp": 1.00075197, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.4940653199605143, + "language_loss": 0.73552126, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75823772, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.5781922340393066 + }, + { + "auxiliary_loss_clip": 0.01172112, + "auxiliary_loss_mlp": 0.01132302, + "balance_loss_clip": 1.00218177, + "balance_loss_mlp": 1.00098121, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 2.017973286980073, + "language_loss": 0.71284682, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73589092, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.546588659286499 + }, + { + "auxiliary_loss_clip": 0.01157108, + "auxiliary_loss_mlp": 0.01133945, + "balance_loss_clip": 1.0023222, + "balance_loss_mlp": 1.00090706, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.5928676172590701, + "language_loss": 0.61592174, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63883227, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.748788595199585 + }, + { + "auxiliary_loss_clip": 0.01155619, + "auxiliary_loss_mlp": 0.01132813, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.0009203, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.7884472730847767, + "language_loss": 0.73544502, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75832939, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.5682313442230225 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01132827, + "balance_loss_clip": 1.00217319, + "balance_loss_mlp": 1.00074315, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.7324080449832038, + "language_loss": 0.7569865, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78003603, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.602418899536133 + }, + { + "auxiliary_loss_clip": 0.0113893, + "auxiliary_loss_mlp": 0.01133315, + "balance_loss_clip": 1.00208056, + "balance_loss_mlp": 1.00094509, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 2.0391333517012513, + "language_loss": 0.72292519, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.74564767, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.6832773685455322 + }, + { + "auxiliary_loss_clip": 0.01140235, + "auxiliary_loss_mlp": 0.01133543, + "balance_loss_clip": 1.00212193, + "balance_loss_mlp": 1.00079179, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.336886699982142, + "language_loss": 0.65898269, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68172044, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 3.9321272373199463 + }, + { + "auxiliary_loss_clip": 0.01140345, + "auxiliary_loss_mlp": 0.01132342, + "balance_loss_clip": 1.00214458, + "balance_loss_mlp": 1.00073504, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 1.7047889606348765, + "language_loss": 0.67001009, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.69273698, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.5517995357513428 + }, + { + "auxiliary_loss_clip": 0.01138721, + "auxiliary_loss_mlp": 0.01133553, + "balance_loss_clip": 1.00211334, + "balance_loss_mlp": 1.00108743, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 1.7897916091074062, + "language_loss": 0.77037525, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79309797, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.5815770626068115 + }, + { + "auxiliary_loss_clip": 0.01155351, + "auxiliary_loss_mlp": 0.01132701, + "balance_loss_clip": 1.00211334, + "balance_loss_mlp": 1.00090349, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.2387346998670363, + "language_loss": 0.75186396, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77474451, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.602196455001831 + }, + { + "auxiliary_loss_clip": 0.01156506, + "auxiliary_loss_mlp": 0.01132033, + "balance_loss_clip": 1.00212049, + "balance_loss_mlp": 1.00071239, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.2881091877347233, + "language_loss": 0.79390132, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81678677, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.5833475589752197 + }, + { + "auxiliary_loss_clip": 0.01157029, + "auxiliary_loss_mlp": 0.01132905, + "balance_loss_clip": 1.00222933, + "balance_loss_mlp": 1.0009166, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 2.85006422807211, + "language_loss": 0.81604433, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83894372, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.512255907058716 + }, + { + "auxiliary_loss_clip": 0.01105371, + "auxiliary_loss_mlp": 0.01132862, + "balance_loss_clip": 1.00167644, + "balance_loss_mlp": 1.00068259, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 1.8435527431020429, + "language_loss": 0.84052265, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.86290497, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 4.0257956981658936 + }, + { + "auxiliary_loss_clip": 0.01172221, + "auxiliary_loss_mlp": 0.01132879, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00089025, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1.906530821228553, + "language_loss": 0.74596012, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76901114, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.4906392097473145 + }, + { + "auxiliary_loss_clip": 0.01155165, + "auxiliary_loss_mlp": 0.01133426, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00076962, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 1.9315725349930002, + "language_loss": 0.61825657, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.64114249, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.5377864837646484 + }, + { + "auxiliary_loss_clip": 0.01155533, + "auxiliary_loss_mlp": 0.01133133, + "balance_loss_clip": 1.00205386, + "balance_loss_mlp": 1.0008589, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 1.8252694446989903, + "language_loss": 0.74348265, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76636928, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 2.529656171798706 + }, + { + "auxiliary_loss_clip": 0.011556, + "auxiliary_loss_mlp": 0.01132597, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.00070405, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.4130887637075686, + "language_loss": 0.71771336, + "learning_rate": 3.001910665140316e-06, + "loss": 0.74059534, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.6816656589508057 + }, + { + "auxiliary_loss_clip": 0.01156815, + "auxiliary_loss_mlp": 0.01132024, + "balance_loss_clip": 1.00219274, + "balance_loss_mlp": 1.00089407, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 1.7665207477601312, + "language_loss": 0.73962259, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76251101, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 5.375467538833618 + }, + { + "auxiliary_loss_clip": 0.01139807, + "auxiliary_loss_mlp": 0.00748461, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00143528, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 2.220103591948939, + "language_loss": 0.82627928, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84516191, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 2.592607021331787 + }, + { + "auxiliary_loss_clip": 0.01141432, + "auxiliary_loss_mlp": 0.01133477, + "balance_loss_clip": 1.00223029, + "balance_loss_mlp": 1.00072551, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.9223874019556275, + "language_loss": 0.66094929, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68369842, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.612348794937134 + }, + { + "auxiliary_loss_clip": 0.01155544, + "auxiliary_loss_mlp": 0.01116755, + "balance_loss_clip": 1.00315189, + "balance_loss_mlp": 1.00002563, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7715685534499087, + "language_loss": 0.61452341, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63724637, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 3.0394251346588135 + }, + { + "auxiliary_loss_clip": 0.01099509, + "auxiliary_loss_mlp": 0.01132626, + "balance_loss_clip": 1.0026269, + "balance_loss_mlp": 1.00101876, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 3.7935826431836017, + "language_loss": 0.80059874, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82292008, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.676175117492676 + }, + { + "auxiliary_loss_clip": 0.0112208, + "auxiliary_loss_mlp": 0.00746536, + "balance_loss_clip": 1.00280511, + "balance_loss_mlp": 1.00011218, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6728065331373162, + "language_loss": 0.56734812, + "learning_rate": 2.999887569990088e-06, + "loss": 0.5860343, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 3.276407241821289 + }, + { + "auxiliary_loss_clip": 0.01140534, + "auxiliary_loss_mlp": 0.01133505, + "balance_loss_clip": 1.00222993, + "balance_loss_mlp": 1.00094414, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.4853094430480656, + "language_loss": 0.72109258, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74383295, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.6239054203033447 + }, + { + "auxiliary_loss_clip": 0.01138902, + "auxiliary_loss_mlp": 0.01132377, + "balance_loss_clip": 1.001845, + "balance_loss_mlp": 1.00076997, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.5956664969964753, + "language_loss": 0.7896471, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.81235993, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.562225580215454 + }, + { + "auxiliary_loss_clip": 0.01122215, + "auxiliary_loss_mlp": 0.0113342, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00076389, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 3.2036923009518232, + "language_loss": 0.62958884, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65214515, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.6307497024536133 + }, + { + "auxiliary_loss_clip": 0.01138806, + "auxiliary_loss_mlp": 0.01132591, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.0007937, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 1.9515821872615113, + "language_loss": 0.66216063, + "learning_rate": 2.998538081402727e-06, + "loss": 0.68487465, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.5819029808044434 + }, + { + "auxiliary_loss_clip": 0.01155836, + "auxiliary_loss_mlp": 0.01132376, + "balance_loss_clip": 1.00238955, + "balance_loss_mlp": 1.00086391, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3601727666878975, + "language_loss": 0.75170392, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77458596, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.5581042766571045 + }, + { + "auxiliary_loss_clip": 0.0114004, + "auxiliary_loss_mlp": 0.01132803, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00090981, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.2615418345647056, + "language_loss": 0.7053324, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72806078, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.6521856784820557 + }, + { + "auxiliary_loss_clip": 0.01122905, + "auxiliary_loss_mlp": 0.01132951, + "balance_loss_clip": 1.00209129, + "balance_loss_mlp": 1.00086713, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.9823116273975767, + "language_loss": 0.78215468, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80471325, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.6170380115509033 + }, + { + "auxiliary_loss_clip": 0.01144543, + "auxiliary_loss_mlp": 0.0113266, + "balance_loss_clip": 1.0022409, + "balance_loss_mlp": 1.00076711, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 2.1717862397421763, + "language_loss": 0.7524296, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77520168, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.5800352096557617 + }, + { + "auxiliary_loss_clip": 0.01108155, + "auxiliary_loss_mlp": 0.01132518, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00081539, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.2264716137825857, + "language_loss": 0.82906485, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85147154, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.6264476776123047 + }, + { + "auxiliary_loss_clip": 0.01172024, + "auxiliary_loss_mlp": 0.01131452, + "balance_loss_clip": 1.00219929, + "balance_loss_mlp": 1.00060797, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.0144769741958495, + "language_loss": 0.78632486, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80935961, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.4908664226531982 + }, + { + "auxiliary_loss_clip": 0.01089555, + "auxiliary_loss_mlp": 0.01131334, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.00077605, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 2.1880746417675137, + "language_loss": 0.65748572, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67969459, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 2.681083917617798 + }, + { + "auxiliary_loss_clip": 0.01140536, + "auxiliary_loss_mlp": 0.0113257, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00086761, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.9030910758076525, + "language_loss": 0.76874208, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79147321, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 2.609858512878418 + }, + { + "auxiliary_loss_clip": 0.01123692, + "auxiliary_loss_mlp": 0.01132573, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00077486, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.6269714140187548, + "language_loss": 0.80808961, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.83065224, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 2.6206588745117188 + }, + { + "auxiliary_loss_clip": 0.01144588, + "auxiliary_loss_mlp": 0.0113185, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00072026, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.9086048736587409, + "language_loss": 0.79932737, + "learning_rate": 2.99516171119991e-06, + "loss": 0.82209176, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 2.6079206466674805 + }, + { + "auxiliary_loss_clip": 0.01123272, + "auxiliary_loss_mlp": 0.01132792, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00080335, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.3068923241958026, + "language_loss": 0.73217762, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75473821, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.6712310314178467 + }, + { + "auxiliary_loss_clip": 0.01140276, + "auxiliary_loss_mlp": 0.01132321, + "balance_loss_clip": 1.00215137, + "balance_loss_mlp": 1.00090456, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.166808314697379, + "language_loss": 0.67445433, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69718027, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.5412392616271973 + }, + { + "auxiliary_loss_clip": 0.01107973, + "auxiliary_loss_mlp": 0.01132308, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00079608, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 3.921120161036275, + "language_loss": 0.69539428, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71779704, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.6609904766082764 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.00748438, + "balance_loss_clip": 1.00227714, + "balance_loss_mlp": 1.00145555, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6923640820706547, + "language_loss": 0.74842167, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.7673704, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.580641984939575 + }, + { + "auxiliary_loss_clip": 0.01140134, + "auxiliary_loss_mlp": 0.01132546, + "balance_loss_clip": 1.00214314, + "balance_loss_mlp": 1.00065279, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 1.66602959032512, + "language_loss": 0.83505487, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85778171, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.5665385723114014 + }, + { + "auxiliary_loss_clip": 0.01139749, + "auxiliary_loss_mlp": 0.00748379, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00135815, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.5601262652638268, + "language_loss": 0.70299584, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7218771, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.6465115547180176 + }, + { + "auxiliary_loss_clip": 0.01140818, + "auxiliary_loss_mlp": 0.01132242, + "balance_loss_clip": 1.00207639, + "balance_loss_mlp": 1.00082588, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 5.127899206964806, + "language_loss": 0.81689322, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83962381, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.617672920227051 + }, + { + "auxiliary_loss_clip": 0.01171965, + "auxiliary_loss_mlp": 0.01131784, + "balance_loss_clip": 1.00213039, + "balance_loss_mlp": 1.00093961, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.8621164613265033, + "language_loss": 0.74079156, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76382899, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 3.9103283882141113 + }, + { + "auxiliary_loss_clip": 0.01172096, + "auxiliary_loss_mlp": 0.00748524, + "balance_loss_clip": 1.00217593, + "balance_loss_mlp": 1.00155318, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.7674618122414567, + "language_loss": 0.79490173, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81410789, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.568922519683838 + }, + { + "auxiliary_loss_clip": 0.01141855, + "auxiliary_loss_mlp": 0.01132115, + "balance_loss_clip": 1.00218451, + "balance_loss_mlp": 1.00079405, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.7274635700986505, + "language_loss": 0.81172347, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83446312, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.60178542137146 + }, + { + "auxiliary_loss_clip": 0.01155894, + "auxiliary_loss_mlp": 0.00748596, + "balance_loss_clip": 1.00229132, + "balance_loss_mlp": 1.00153446, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.8328148611065171, + "language_loss": 0.75793189, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77697682, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.550414800643921 + }, + { + "auxiliary_loss_clip": 0.01155318, + "auxiliary_loss_mlp": 0.0113153, + "balance_loss_clip": 1.00212991, + "balance_loss_mlp": 1.00068581, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.8744204979580323, + "language_loss": 0.7056756, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72854412, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.5587992668151855 + }, + { + "auxiliary_loss_clip": 0.01155478, + "auxiliary_loss_mlp": 0.01132586, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.0006932, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.8098367701302274, + "language_loss": 0.75126982, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77415049, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.5173699855804443 + }, + { + "auxiliary_loss_clip": 0.01138832, + "auxiliary_loss_mlp": 0.00748433, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.0014689, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.0888075807244895, + "language_loss": 0.78690076, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80577338, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 3.9144649505615234 + }, + { + "auxiliary_loss_clip": 0.0112432, + "auxiliary_loss_mlp": 0.01130877, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00089085, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 1.8267355800814338, + "language_loss": 0.72375226, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74630421, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.5876638889312744 + }, + { + "auxiliary_loss_clip": 0.01140452, + "auxiliary_loss_mlp": 0.01132399, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00060129, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.0801197227242594, + "language_loss": 0.74873078, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.77145922, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.595336675643921 + }, + { + "auxiliary_loss_clip": 0.01094381, + "auxiliary_loss_mlp": 0.0113204, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00062346, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.674240022811264, + "language_loss": 0.75203556, + "learning_rate": 2.989413228164047e-06, + "loss": 0.7742998, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 2.7608022689819336 + }, + { + "auxiliary_loss_clip": 0.01138372, + "auxiliary_loss_mlp": 0.01131748, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00080848, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.729118752439465, + "language_loss": 0.68362045, + "learning_rate": 2.989074743819502e-06, + "loss": 0.7063216, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.622051477432251 + }, + { + "auxiliary_loss_clip": 0.0115513, + "auxiliary_loss_mlp": 0.01131308, + "balance_loss_clip": 1.00221407, + "balance_loss_mlp": 1.00084579, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 1.6761947342428103, + "language_loss": 0.78078401, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80364841, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 3.9223358631134033 + }, + { + "auxiliary_loss_clip": 0.01141553, + "auxiliary_loss_mlp": 0.01132969, + "balance_loss_clip": 1.002038, + "balance_loss_mlp": 1.00079012, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.6846432257530788, + "language_loss": 0.70730191, + "learning_rate": 2.98839766262581e-06, + "loss": 0.73004711, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 4.1469151973724365 + }, + { + "auxiliary_loss_clip": 0.01156653, + "auxiliary_loss_mlp": 0.01131293, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.0007354, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.2543251612354154, + "language_loss": 0.86981553, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89269501, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.4958832263946533 + }, + { + "auxiliary_loss_clip": 0.01140063, + "auxiliary_loss_mlp": 0.01131906, + "balance_loss_clip": 1.00201988, + "balance_loss_mlp": 1.00067997, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.955165077333272, + "language_loss": 0.76971495, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79243469, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.5946671962738037 + }, + { + "auxiliary_loss_clip": 0.01125062, + "auxiliary_loss_mlp": 0.0113186, + "balance_loss_clip": 1.00212419, + "balance_loss_mlp": 1.00063443, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3219020745644317, + "language_loss": 0.82431477, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84688401, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.5904250144958496 + }, + { + "auxiliary_loss_clip": 0.01172002, + "auxiliary_loss_mlp": 0.01132258, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00065148, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 6.668656718673762, + "language_loss": 0.7097649, + "learning_rate": 2.98704305057949e-06, + "loss": 0.73280746, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.595381736755371 + }, + { + "auxiliary_loss_clip": 0.01154712, + "auxiliary_loss_mlp": 0.01131161, + "balance_loss_clip": 1.00190461, + "balance_loss_mlp": 1.00069869, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.5942092923271793, + "language_loss": 0.75963914, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78249788, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.519918203353882 + }, + { + "auxiliary_loss_clip": 0.01125998, + "auxiliary_loss_mlp": 0.01132197, + "balance_loss_clip": 1.00208604, + "balance_loss_mlp": 1.00068557, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 4.2210375533326285, + "language_loss": 0.88381004, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90639198, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 2.6176013946533203 + }, + { + "auxiliary_loss_clip": 0.01092927, + "auxiliary_loss_mlp": 0.0113163, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00059497, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.3392626105206005, + "language_loss": 0.74768329, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76992881, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.667599678039551 + }, + { + "auxiliary_loss_clip": 0.01153195, + "auxiliary_loss_mlp": 0.01117273, + "balance_loss_clip": 1.00275278, + "balance_loss_mlp": 1.00054288, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9156901839233628, + "language_loss": 0.63847315, + "learning_rate": 2.985687839672857e-06, + "loss": 0.66117775, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.8825416564941406 + }, + { + "auxiliary_loss_clip": 0.01155675, + "auxiliary_loss_mlp": 0.01132425, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00081754, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 1.9043712366629102, + "language_loss": 0.73722696, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76010799, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.553745985031128 + }, + { + "auxiliary_loss_clip": 0.01124053, + "auxiliary_loss_mlp": 0.01132946, + "balance_loss_clip": 1.00220573, + "balance_loss_mlp": 1.00076652, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.929172995386634, + "language_loss": 0.7687425, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79131246, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.629709243774414 + }, + { + "auxiliary_loss_clip": 0.01139905, + "auxiliary_loss_mlp": 0.01131614, + "balance_loss_clip": 1.00186837, + "balance_loss_mlp": 1.00057888, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 1.9643702833023464, + "language_loss": 0.68079782, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.70351303, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.5446956157684326 + }, + { + "auxiliary_loss_clip": 0.01156858, + "auxiliary_loss_mlp": 0.01132166, + "balance_loss_clip": 1.00224996, + "balance_loss_mlp": 1.00074959, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 1.843080714733303, + "language_loss": 0.79085803, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81374824, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.5472829341888428 + }, + { + "auxiliary_loss_clip": 0.01138627, + "auxiliary_loss_mlp": 0.0113156, + "balance_loss_clip": 1.00215614, + "balance_loss_mlp": 1.00081098, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 2.0286258489399898, + "language_loss": 0.85310435, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87580627, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.616612672805786 + }, + { + "auxiliary_loss_clip": 0.01141511, + "auxiliary_loss_mlp": 0.01132779, + "balance_loss_clip": 1.00209963, + "balance_loss_mlp": 1.00079012, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 3.004487165931586, + "language_loss": 0.77743065, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.80017352, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.7112879753112793 + }, + { + "auxiliary_loss_clip": 0.01091004, + "auxiliary_loss_mlp": 0.01131686, + "balance_loss_clip": 1.00178158, + "balance_loss_mlp": 1.00074649, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.8008354183361097, + "language_loss": 0.75533807, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.777565, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 2.6717190742492676 + }, + { + "auxiliary_loss_clip": 0.01124189, + "auxiliary_loss_mlp": 0.00748422, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00137305, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0747387664211545, + "language_loss": 0.69795632, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71668237, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.655489921569824 + }, + { + "auxiliary_loss_clip": 0.01171928, + "auxiliary_loss_mlp": 0.0113152, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00067568, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 2.6566315779210967, + "language_loss": 0.7987572, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.82179159, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 2.50354266166687 + }, + { + "auxiliary_loss_clip": 0.0117203, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.00091648, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4124223473059307, + "language_loss": 0.81948352, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84251857, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.5497236251831055 + }, + { + "auxiliary_loss_clip": 0.01155267, + "auxiliary_loss_mlp": 0.01131953, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.00082314, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.6400258509326706, + "language_loss": 0.70032769, + "learning_rate": 2.981957928520201e-06, + "loss": 0.7231999, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.51831316947937 + }, + { + "auxiliary_loss_clip": 0.01155383, + "auxiliary_loss_mlp": 0.01132494, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00098276, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 3.00108648785177, + "language_loss": 0.67501688, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69789565, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.566075086593628 + }, + { + "auxiliary_loss_clip": 0.0115514, + "auxiliary_loss_mlp": 0.01131673, + "balance_loss_clip": 1.00205743, + "balance_loss_mlp": 1.00082886, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.6514409663518392, + "language_loss": 0.67715079, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70001894, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.5669994354248047 + }, + { + "auxiliary_loss_clip": 0.01104813, + "auxiliary_loss_mlp": 0.01131958, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00082755, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.3748781665258263, + "language_loss": 0.77792692, + "learning_rate": 2.980939897348969e-06, + "loss": 0.80029464, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.6162824630737305 + }, + { + "auxiliary_loss_clip": 0.01156784, + "auxiliary_loss_mlp": 0.01132604, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.0009017, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.3631553018006004, + "language_loss": 0.69382226, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71671617, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.6399667263031006 + }, + { + "auxiliary_loss_clip": 0.0114024, + "auxiliary_loss_mlp": 0.00748484, + "balance_loss_clip": 1.00222516, + "balance_loss_mlp": 1.00131416, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 2.2138546043345797, + "language_loss": 0.70886707, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.72775435, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.5846166610717773 + }, + { + "auxiliary_loss_clip": 0.0112464, + "auxiliary_loss_mlp": 0.01131712, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.00067711, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 1.9405728311391215, + "language_loss": 0.78192323, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80448675, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 3.9936366081237793 + }, + { + "auxiliary_loss_clip": 0.01155655, + "auxiliary_loss_mlp": 0.00748292, + "balance_loss_clip": 1.0020982, + "balance_loss_mlp": 1.00122201, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 5.411288718840065, + "language_loss": 0.64633161, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66537106, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.576911449432373 + }, + { + "auxiliary_loss_clip": 0.01171985, + "auxiliary_loss_mlp": 0.00748286, + "balance_loss_clip": 1.00210786, + "balance_loss_mlp": 1.00127983, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.425175580294328, + "language_loss": 0.78043222, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.79963499, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.478900194168091 + }, + { + "auxiliary_loss_clip": 0.01122219, + "auxiliary_loss_mlp": 0.01132501, + "balance_loss_clip": 1.00234628, + "balance_loss_mlp": 1.00089347, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.655836954724061, + "language_loss": 0.80526352, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82781076, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.6658833026885986 + }, + { + "auxiliary_loss_clip": 0.01145223, + "auxiliary_loss_mlp": 0.01132951, + "balance_loss_clip": 1.00220394, + "balance_loss_mlp": 1.00067627, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.7095383176131371, + "language_loss": 0.78860664, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81138837, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.6081459522247314 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.01132188, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00058126, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 1.7999041965229672, + "language_loss": 0.72635502, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74907082, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.5573160648345947 + }, + { + "auxiliary_loss_clip": 0.0115635, + "auxiliary_loss_mlp": 0.01132684, + "balance_loss_clip": 1.00220215, + "balance_loss_mlp": 1.00079107, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 2.1693815108903074, + "language_loss": 0.63883507, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66172534, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.6042044162750244 + }, + { + "auxiliary_loss_clip": 0.01155367, + "auxiliary_loss_mlp": 0.01132318, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.00090194, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 1.9940957328447528, + "language_loss": 0.74238664, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76526356, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 3.994595527648926 + }, + { + "auxiliary_loss_clip": 0.01169928, + "auxiliary_loss_mlp": 0.01116175, + "balance_loss_clip": 1.00273871, + "balance_loss_mlp": 1.00020838, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7843890811110423, + "language_loss": 0.6066314, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.6294924, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.197124719619751 + }, + { + "auxiliary_loss_clip": 0.01138815, + "auxiliary_loss_mlp": 0.01132147, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00073063, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 1.8590762033379786, + "language_loss": 0.72399831, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74670792, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.5449254512786865 + }, + { + "auxiliary_loss_clip": 0.01139489, + "auxiliary_loss_mlp": 0.00748253, + "balance_loss_clip": 1.00190914, + "balance_loss_mlp": 1.00114226, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.6518369559343595, + "language_loss": 0.80972433, + "learning_rate": 2.976524564880326e-06, + "loss": 0.82860172, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.01172189, + "auxiliary_loss_mlp": 0.01133057, + "balance_loss_clip": 1.00225043, + "balance_loss_mlp": 1.00097275, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.4601219286929026, + "language_loss": 0.6873374, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71038985, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 3.902618646621704 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.01131429, + "balance_loss_clip": 1.00207603, + "balance_loss_mlp": 1.00087094, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.5842758302017343, + "language_loss": 0.75992757, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.78264213, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 3.9981255531311035 + }, + { + "auxiliary_loss_clip": 0.01090484, + "auxiliary_loss_mlp": 0.01131998, + "balance_loss_clip": 1.00168729, + "balance_loss_mlp": 1.00105822, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 1.763218388375544, + "language_loss": 0.70903742, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.73126233, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 2.749817132949829 + }, + { + "auxiliary_loss_clip": 0.01138942, + "auxiliary_loss_mlp": 0.01132181, + "balance_loss_clip": 1.00201833, + "balance_loss_mlp": 1.00085998, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.7668753525683976, + "language_loss": 0.77394545, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79665667, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.54746675491333 + }, + { + "auxiliary_loss_clip": 0.01156488, + "auxiliary_loss_mlp": 0.01132033, + "balance_loss_clip": 1.00209689, + "balance_loss_mlp": 1.00080717, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 2.043181098780007, + "language_loss": 0.7272011, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.75008631, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.5389342308044434 + }, + { + "auxiliary_loss_clip": 0.01155277, + "auxiliary_loss_mlp": 0.01132374, + "balance_loss_clip": 1.00213194, + "balance_loss_mlp": 1.00076675, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 2.0425825638256114, + "language_loss": 0.70043123, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72330779, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.582181453704834 + }, + { + "auxiliary_loss_clip": 0.01107408, + "auxiliary_loss_mlp": 0.0113154, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00079083, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 3.000885220962987, + "language_loss": 0.69229031, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71467984, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.8321621417999268 + }, + { + "auxiliary_loss_clip": 0.01139499, + "auxiliary_loss_mlp": 0.01131496, + "balance_loss_clip": 1.00200224, + "balance_loss_mlp": 1.00065243, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.7689860232442036, + "language_loss": 0.66572666, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68843657, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.5988411903381348 + }, + { + "auxiliary_loss_clip": 0.01138201, + "auxiliary_loss_mlp": 0.01131696, + "balance_loss_clip": 1.00200713, + "balance_loss_mlp": 1.00094724, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 2.4231268520196383, + "language_loss": 0.74857867, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.77127767, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.538229465484619 + }, + { + "auxiliary_loss_clip": 0.01155798, + "auxiliary_loss_mlp": 0.01131559, + "balance_loss_clip": 1.00235868, + "balance_loss_mlp": 1.00071526, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.5038367642089356, + "language_loss": 0.75780594, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78067946, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.5580320358276367 + }, + { + "auxiliary_loss_clip": 0.01172027, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_clip": 1.00224328, + "balance_loss_mlp": 1.00078452, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.6128092407533607, + "language_loss": 0.73272038, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75575495, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 2.4884772300720215 + }, + { + "auxiliary_loss_clip": 0.01138711, + "auxiliary_loss_mlp": 0.01132421, + "balance_loss_clip": 1.00214505, + "balance_loss_mlp": 1.00081444, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 3.5222702077786585, + "language_loss": 0.71301591, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73572725, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.5930335521698 + }, + { + "auxiliary_loss_clip": 0.01121206, + "auxiliary_loss_mlp": 0.01131471, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00062656, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.8024735141079764, + "language_loss": 0.88576394, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90829068, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.6439380645751953 + }, + { + "auxiliary_loss_clip": 0.01172039, + "auxiliary_loss_mlp": 0.01131651, + "balance_loss_clip": 1.00224411, + "balance_loss_mlp": 1.0009973, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.4161407503328722, + "language_loss": 0.58145821, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60449517, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.5602431297302246 + }, + { + "auxiliary_loss_clip": 0.01171939, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.00205886, + "balance_loss_mlp": 1.00080645, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 3.9455489915687556, + "language_loss": 0.76397741, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78701234, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.4666268825531006 + }, + { + "auxiliary_loss_clip": 0.01121926, + "auxiliary_loss_mlp": 0.01131913, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00078273, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.6215614648469918, + "language_loss": 0.70237982, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72491825, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.716909170150757 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_clip": 1.00211668, + "balance_loss_mlp": 1.00080919, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5659382310750884, + "language_loss": 0.74761403, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.77033329, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.5721569061279297 + }, + { + "auxiliary_loss_clip": 0.01172043, + "auxiliary_loss_mlp": 0.01132076, + "balance_loss_clip": 1.00226223, + "balance_loss_mlp": 1.00075543, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.6773359291959493, + "language_loss": 0.78867996, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.81172115, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.5340473651885986 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01132041, + "balance_loss_clip": 1.00230908, + "balance_loss_mlp": 1.00062466, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.7908381761786685, + "language_loss": 0.66491866, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68763041, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.644131898880005 + }, + { + "auxiliary_loss_clip": 0.01171955, + "auxiliary_loss_mlp": 0.00748361, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.00120413, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 1.5524525176328243, + "language_loss": 0.78841543, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.80761856, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.6033618450164795 + }, + { + "auxiliary_loss_clip": 0.01108768, + "auxiliary_loss_mlp": 0.01131915, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.00088, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 2.169148585809897, + "language_loss": 0.91179329, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93420005, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.6483750343322754 + }, + { + "auxiliary_loss_clip": 0.01123422, + "auxiliary_loss_mlp": 0.01131766, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00101686, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.714945309750713, + "language_loss": 0.80553514, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82808697, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.6141629219055176 + }, + { + "auxiliary_loss_clip": 0.01139666, + "auxiliary_loss_mlp": 0.0113247, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.0009582, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.0672684068978353, + "language_loss": 0.84625316, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86897457, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.5427281856536865 + }, + { + "auxiliary_loss_clip": 0.01123469, + "auxiliary_loss_mlp": 0.01131826, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00079143, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.7707952859534972, + "language_loss": 0.71811652, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74066949, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.6947638988494873 + }, + { + "auxiliary_loss_clip": 0.01139748, + "auxiliary_loss_mlp": 0.01131985, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00085449, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.5448480964709195, + "language_loss": 0.79153562, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81425291, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.5637400150299072 + }, + { + "auxiliary_loss_clip": 0.01107817, + "auxiliary_loss_mlp": 0.01131799, + "balance_loss_clip": 1.00178254, + "balance_loss_mlp": 1.00076437, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.7271908631889494, + "language_loss": 0.77969438, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80209053, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.6145858764648438 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01131923, + "balance_loss_clip": 1.00198233, + "balance_loss_mlp": 1.00069785, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 1.9032946031159814, + "language_loss": 0.81473923, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83729053, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.595062494277954 + }, + { + "auxiliary_loss_clip": 0.01140466, + "auxiliary_loss_mlp": 0.01116246, + "balance_loss_clip": 1.00314522, + "balance_loss_mlp": 1.00027931, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9177653953690061, + "language_loss": 0.56780744, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.59037459, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 4.398712396621704 + }, + { + "auxiliary_loss_clip": 0.01156476, + "auxiliary_loss_mlp": 0.01131764, + "balance_loss_clip": 1.00221956, + "balance_loss_mlp": 1.0009197, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.8158842320366413, + "language_loss": 0.68814933, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.71103173, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.553546667098999 + }, + { + "auxiliary_loss_clip": 0.01172056, + "auxiliary_loss_mlp": 0.01131187, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00091481, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.632993076301717, + "language_loss": 0.80285251, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82588494, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.536210060119629 + }, + { + "auxiliary_loss_clip": 0.01092167, + "auxiliary_loss_mlp": 0.0113114, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00067759, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 4.239866381885339, + "language_loss": 0.78825629, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81048936, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.670020580291748 + }, + { + "auxiliary_loss_clip": 0.01121873, + "auxiliary_loss_mlp": 0.01131342, + "balance_loss_clip": 1.00182021, + "balance_loss_mlp": 1.00087953, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 2.9272013130804395, + "language_loss": 0.7967059, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.81923807, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.6290552616119385 + }, + { + "auxiliary_loss_clip": 0.01171998, + "auxiliary_loss_mlp": 0.00748323, + "balance_loss_clip": 1.00220752, + "balance_loss_mlp": 1.0011214, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5227996574113731, + "language_loss": 0.675699, + "learning_rate": 2.965288372816436e-06, + "loss": 0.6949023, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.595659017562866 + }, + { + "auxiliary_loss_clip": 0.01141846, + "auxiliary_loss_mlp": 0.0113186, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00082493, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.0839101921976004, + "language_loss": 0.67066056, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69339764, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.5843615531921387 + }, + { + "auxiliary_loss_clip": 0.011398, + "auxiliary_loss_mlp": 0.01132319, + "balance_loss_clip": 1.00209546, + "balance_loss_mlp": 1.00071216, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 2.5452663858915208, + "language_loss": 0.7156297, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73835087, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 4.0565025806427 + }, + { + "auxiliary_loss_clip": 0.01138415, + "auxiliary_loss_mlp": 0.01131627, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00078249, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.8804033170703096, + "language_loss": 0.71388471, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73658514, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.648747205734253 + }, + { + "auxiliary_loss_clip": 0.01156584, + "auxiliary_loss_mlp": 0.01131507, + "balance_loss_clip": 1.00216401, + "balance_loss_mlp": 1.00094938, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.8305700291962603, + "language_loss": 0.76113194, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78401279, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.541131019592285 + }, + { + "auxiliary_loss_clip": 0.01171996, + "auxiliary_loss_mlp": 0.01132081, + "balance_loss_clip": 1.00225079, + "balance_loss_mlp": 1.00085497, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 1.939514273599559, + "language_loss": 0.76127732, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78431809, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.516400098800659 + }, + { + "auxiliary_loss_clip": 0.01155312, + "auxiliary_loss_mlp": 0.00748225, + "balance_loss_clip": 1.00214732, + "balance_loss_mlp": 1.00103581, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 2.025646527563842, + "language_loss": 0.86081749, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.87985289, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 3.961303949356079 + }, + { + "auxiliary_loss_clip": 0.01155222, + "auxiliary_loss_mlp": 0.0113135, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.00098264, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.4243531404912189, + "language_loss": 0.72494197, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74780774, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 3.9401988983154297 + }, + { + "auxiliary_loss_clip": 0.01125107, + "auxiliary_loss_mlp": 0.01132564, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00076604, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.7050214198223856, + "language_loss": 0.73533201, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75790876, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.614424228668213 + }, + { + "auxiliary_loss_clip": 0.0117207, + "auxiliary_loss_mlp": 0.011319, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.0005796, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 1.7510368949519497, + "language_loss": 0.69960093, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.72264063, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 2.4796440601348877 + }, + { + "auxiliary_loss_clip": 0.0115553, + "auxiliary_loss_mlp": 0.01131694, + "balance_loss_clip": 1.0021807, + "balance_loss_mlp": 1.00075436, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 2.063714855106816, + "language_loss": 0.7319445, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75481671, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.5220510959625244 + }, + { + "auxiliary_loss_clip": 0.01125234, + "auxiliary_loss_mlp": 0.01131296, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00073814, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.5840492565902864, + "language_loss": 0.80166596, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82423127, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.658926010131836 + }, + { + "auxiliary_loss_clip": 0.01155324, + "auxiliary_loss_mlp": 0.01131974, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.00065327, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 2.174348548168174, + "language_loss": 0.83661646, + "learning_rate": 2.961192577338698e-06, + "loss": 0.85948944, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.01145355, + "auxiliary_loss_mlp": 0.01132383, + "balance_loss_clip": 1.00258005, + "balance_loss_mlp": 1.00096631, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.8820278905350376, + "language_loss": 0.75795829, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.78073567, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 2.577904224395752 + }, + { + "auxiliary_loss_clip": 0.01171989, + "auxiliary_loss_mlp": 0.01131906, + "balance_loss_clip": 1.00226974, + "balance_loss_mlp": 1.00096691, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 2.3269354072837074, + "language_loss": 0.77550381, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79854274, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.490726947784424 + }, + { + "auxiliary_loss_clip": 0.01140233, + "auxiliary_loss_mlp": 0.01132197, + "balance_loss_clip": 1.00208199, + "balance_loss_mlp": 1.00078082, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.8215813089870974, + "language_loss": 0.74443585, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.76716018, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.55230975151062 + }, + { + "auxiliary_loss_clip": 0.01104972, + "auxiliary_loss_mlp": 0.01131839, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00061369, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 2.03650347280664, + "language_loss": 0.68767136, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.7100395, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.621941089630127 + }, + { + "auxiliary_loss_clip": 0.01139935, + "auxiliary_loss_mlp": 0.01131427, + "balance_loss_clip": 1.00207901, + "balance_loss_mlp": 1.00077391, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7505010979382039, + "language_loss": 0.82529271, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84800637, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.581514358520508 + }, + { + "auxiliary_loss_clip": 0.01172092, + "auxiliary_loss_mlp": 0.01131894, + "balance_loss_clip": 1.00227952, + "balance_loss_mlp": 1.00085962, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.6827913455202375, + "language_loss": 0.74087846, + "learning_rate": 2.959142709981763e-06, + "loss": 0.7639184, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.459810256958008 + }, + { + "auxiliary_loss_clip": 0.01161323, + "auxiliary_loss_mlp": 0.01131149, + "balance_loss_clip": 1.0023433, + "balance_loss_mlp": 1.00068688, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 3.279348315324208, + "language_loss": 0.68738103, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71030569, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.5054616928100586 + }, + { + "auxiliary_loss_clip": 0.01108359, + "auxiliary_loss_mlp": 0.01131074, + "balance_loss_clip": 1.00198781, + "balance_loss_mlp": 1.00080204, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.027719072608287, + "language_loss": 0.76595485, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.78834915, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.6092402935028076 + }, + { + "auxiliary_loss_clip": 0.01123974, + "auxiliary_loss_mlp": 0.01132295, + "balance_loss_clip": 1.00220299, + "balance_loss_mlp": 1.00078368, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 1.5899893309831947, + "language_loss": 0.78022128, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80278397, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.586103677749634 + }, + { + "auxiliary_loss_clip": 0.01128446, + "auxiliary_loss_mlp": 0.01131782, + "balance_loss_clip": 1.00237191, + "balance_loss_mlp": 1.00084305, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6955489799699666, + "language_loss": 0.78457344, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80717564, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.5941224098205566 + }, + { + "auxiliary_loss_clip": 0.0117191, + "auxiliary_loss_mlp": 0.00748204, + "balance_loss_clip": 1.00218523, + "balance_loss_mlp": 1.00093603, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.9220282124500052, + "language_loss": 0.83439171, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85359287, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.5584404468536377 + }, + { + "auxiliary_loss_clip": 0.01139426, + "auxiliary_loss_mlp": 0.01130508, + "balance_loss_clip": 1.00195217, + "balance_loss_mlp": 1.0006175, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.2989441914909894, + "language_loss": 0.90509081, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92779016, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.578462600708008 + }, + { + "auxiliary_loss_clip": 0.01120982, + "auxiliary_loss_mlp": 0.01115988, + "balance_loss_clip": 1.00298131, + "balance_loss_mlp": 1.00002134, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8677304043698308, + "language_loss": 0.53385276, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55622244, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.1317038536071777 + }, + { + "auxiliary_loss_clip": 0.01138888, + "auxiliary_loss_mlp": 0.0074832, + "balance_loss_clip": 1.00205886, + "balance_loss_mlp": 1.00099337, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.6748621355619009, + "language_loss": 0.77758288, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79645491, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 2.6155240535736084 + }, + { + "auxiliary_loss_clip": 0.01156976, + "auxiliary_loss_mlp": 0.01131635, + "balance_loss_clip": 1.0022496, + "balance_loss_mlp": 1.00098121, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 3.7412746232219805, + "language_loss": 0.79216069, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81504679, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.5297648906707764 + }, + { + "auxiliary_loss_clip": 0.01172072, + "auxiliary_loss_mlp": 0.01132088, + "balance_loss_clip": 1.00230801, + "balance_loss_mlp": 1.00067174, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 2.6171041667776733, + "language_loss": 0.84841353, + "learning_rate": 2.955723356106876e-06, + "loss": 0.87145507, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.491306781768799 + }, + { + "auxiliary_loss_clip": 0.01138922, + "auxiliary_loss_mlp": 0.01132345, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.000929, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 1.9672939920140187, + "language_loss": 0.72176117, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74447387, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.5730135440826416 + }, + { + "auxiliary_loss_clip": 0.01156884, + "auxiliary_loss_mlp": 0.01131465, + "balance_loss_clip": 1.00212443, + "balance_loss_mlp": 1.00081205, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 2.134459724136833, + "language_loss": 0.82764071, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85052419, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.544855833053589 + }, + { + "auxiliary_loss_clip": 0.01123536, + "auxiliary_loss_mlp": 0.01131567, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.00072312, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.8325798075687836, + "language_loss": 0.76124012, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78379118, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.6000218391418457 + }, + { + "auxiliary_loss_clip": 0.01138386, + "auxiliary_loss_mlp": 0.01131566, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00072193, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.6197601290611392, + "language_loss": 0.83146048, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85416007, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 3.95180082321167 + }, + { + "auxiliary_loss_clip": 0.01172039, + "auxiliary_loss_mlp": 0.01131679, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00083542, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.9681949331570796, + "language_loss": 0.62658525, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64962244, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.4940173625946045 + }, + { + "auxiliary_loss_clip": 0.01138039, + "auxiliary_loss_mlp": 0.01130448, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00084364, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.7930729527136333, + "language_loss": 0.8395372, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86222208, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.5649681091308594 + }, + { + "auxiliary_loss_clip": 0.0117202, + "auxiliary_loss_mlp": 0.01131724, + "balance_loss_clip": 1.00219214, + "balance_loss_mlp": 1.00068963, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 2.5895571990795205, + "language_loss": 0.91394794, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93698537, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.4564297199249268 + }, + { + "auxiliary_loss_clip": 0.01171896, + "auxiliary_loss_mlp": 0.01131562, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00090861, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.90015035491462, + "language_loss": 0.73640132, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75943589, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.50789475440979 + }, + { + "auxiliary_loss_clip": 0.01076301, + "auxiliary_loss_mlp": 0.01132052, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00082636, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 2.3866463065663277, + "language_loss": 0.64942479, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67150825, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 2.7528858184814453 + }, + { + "auxiliary_loss_clip": 0.01155543, + "auxiliary_loss_mlp": 0.01131866, + "balance_loss_clip": 1.00217772, + "balance_loss_mlp": 1.00092626, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 2.0514546243945637, + "language_loss": 0.71727586, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74014992, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.669692277908325 + }, + { + "auxiliary_loss_clip": 0.01155357, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00069833, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 2.4572316902032147, + "language_loss": 0.73694783, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75982636, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.5026803016662598 + }, + { + "auxiliary_loss_clip": 0.01122131, + "auxiliary_loss_mlp": 0.0113178, + "balance_loss_clip": 1.00188816, + "balance_loss_mlp": 1.00084007, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.5716523563378306, + "language_loss": 0.69134724, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71388638, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 4.036471843719482 + }, + { + "auxiliary_loss_clip": 0.01139229, + "auxiliary_loss_mlp": 0.01131898, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.0006721, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.4728737376271461, + "language_loss": 0.76440263, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78711396, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.571986198425293 + }, + { + "auxiliary_loss_clip": 0.01161495, + "auxiliary_loss_mlp": 0.01132261, + "balance_loss_clip": 1.00237513, + "balance_loss_mlp": 1.00084496, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 2.19774297736116, + "language_loss": 0.74046296, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.7634005, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 2.549272060394287 + }, + { + "auxiliary_loss_clip": 0.01123823, + "auxiliary_loss_mlp": 0.01131619, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00087023, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.77095142911081, + "language_loss": 0.80580688, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.82836127, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 4.061383008956909 + }, + { + "auxiliary_loss_clip": 0.0115519, + "auxiliary_loss_mlp": 0.01131417, + "balance_loss_clip": 1.00219083, + "balance_loss_mlp": 1.00095403, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.4779315339372567, + "language_loss": 0.81812572, + "learning_rate": 2.950244857154417e-06, + "loss": 0.84099174, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.559577703475952 + }, + { + "auxiliary_loss_clip": 0.0114015, + "auxiliary_loss_mlp": 0.01132332, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.0006299, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8805967759410338, + "language_loss": 0.79558253, + "learning_rate": 2.9499021441341e-06, + "loss": 0.8183074, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 3.991687536239624 + }, + { + "auxiliary_loss_clip": 0.01141309, + "auxiliary_loss_mlp": 0.01130403, + "balance_loss_clip": 1.00219929, + "balance_loss_mlp": 1.00060833, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.9822871302688172, + "language_loss": 0.75701833, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.77973551, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.564239978790283 + }, + { + "auxiliary_loss_clip": 0.01155225, + "auxiliary_loss_mlp": 0.00748252, + "balance_loss_clip": 1.00215042, + "balance_loss_mlp": 1.00096011, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 2.964463247854036, + "language_loss": 0.72288358, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74191833, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.5806539058685303 + }, + { + "auxiliary_loss_clip": 0.01154936, + "auxiliary_loss_mlp": 0.01133193, + "balance_loss_clip": 1.0021199, + "balance_loss_mlp": 1.00120425, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 3.1287625913847648, + "language_loss": 0.78739917, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81028044, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.575213670730591 + }, + { + "auxiliary_loss_clip": 0.01144585, + "auxiliary_loss_mlp": 0.01132086, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00086045, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 3.262633247068317, + "language_loss": 0.67823082, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.70099753, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.6058480739593506 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.0113188, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00094044, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.7920293203928126, + "language_loss": 0.85508001, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87763751, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.5900156497955322 + }, + { + "auxiliary_loss_clip": 0.01123014, + "auxiliary_loss_mlp": 0.01131549, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00099087, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.8406120774375718, + "language_loss": 0.72647363, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74901927, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 2.5987186431884766 + }, + { + "auxiliary_loss_clip": 0.01138815, + "auxiliary_loss_mlp": 0.01133003, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00072837, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.081566997566169, + "language_loss": 0.73991942, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76263767, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.5724005699157715 + }, + { + "auxiliary_loss_clip": 0.01124438, + "auxiliary_loss_mlp": 0.01131451, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00098813, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 2.310094249273277, + "language_loss": 0.73051405, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75307298, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.6538918018341064 + }, + { + "auxiliary_loss_clip": 0.01107444, + "auxiliary_loss_mlp": 0.01132431, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00111055, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.7440271925571431, + "language_loss": 0.77662325, + "learning_rate": 2.946816107593884e-06, + "loss": 0.79902202, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.6244635581970215 + }, + { + "auxiliary_loss_clip": 0.01105958, + "auxiliary_loss_mlp": 0.01115217, + "balance_loss_clip": 1.00287652, + "balance_loss_mlp": 1.00001323, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.7736522026612206, + "language_loss": 0.64817262, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.67038429, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.466686725616455 + }, + { + "auxiliary_loss_clip": 0.01156546, + "auxiliary_loss_mlp": 0.0113201, + "balance_loss_clip": 1.00213945, + "balance_loss_mlp": 1.0007844, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 2.097910688279936, + "language_loss": 0.89857024, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92145574, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.822298526763916 + }, + { + "auxiliary_loss_clip": 0.01138654, + "auxiliary_loss_mlp": 0.01132268, + "balance_loss_clip": 1.00170243, + "balance_loss_mlp": 1.00094652, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.791050568845726, + "language_loss": 0.73749399, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76020324, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.544569969177246 + }, + { + "auxiliary_loss_clip": 0.01139946, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_clip": 1.00200498, + "balance_loss_mlp": 1.00074339, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.728526818375749, + "language_loss": 0.75534499, + "learning_rate": 2.945443601747297e-06, + "loss": 0.77806032, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.576472282409668 + }, + { + "auxiliary_loss_clip": 0.0115644, + "auxiliary_loss_mlp": 0.01131215, + "balance_loss_clip": 1.00208819, + "balance_loss_mlp": 1.00094342, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6218088490849418, + "language_loss": 0.78629446, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80917102, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.5360138416290283 + }, + { + "auxiliary_loss_clip": 0.01153656, + "auxiliary_loss_mlp": 0.01115203, + "balance_loss_clip": 1.00286841, + "balance_loss_mlp": 0.99999952, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8580614867246528, + "language_loss": 0.63406372, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65675235, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 3.180507183074951 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.01131474, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00101137, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.1766117950399715, + "language_loss": 0.71115661, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73386836, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.630906820297241 + }, + { + "auxiliary_loss_clip": 0.01154858, + "auxiliary_loss_mlp": 0.01132103, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00097299, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 1.5338451806344067, + "language_loss": 0.80876172, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83163124, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.5431711673736572 + }, + { + "auxiliary_loss_clip": 0.01139925, + "auxiliary_loss_mlp": 0.0113177, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00073552, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.664119730153417, + "language_loss": 0.83831501, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86103201, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.566202163696289 + }, + { + "auxiliary_loss_clip": 0.01145622, + "auxiliary_loss_mlp": 0.01131442, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.0009793, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 2.214282474931362, + "language_loss": 0.78352869, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80629933, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.5780134201049805 + }, + { + "auxiliary_loss_clip": 0.01124044, + "auxiliary_loss_mlp": 0.01131766, + "balance_loss_clip": 1.00202239, + "balance_loss_mlp": 1.00092161, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 1.8437750402103332, + "language_loss": 0.65087926, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67343736, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.624910593032837 + }, + { + "auxiliary_loss_clip": 0.01140221, + "auxiliary_loss_mlp": 0.01132031, + "balance_loss_clip": 1.00222325, + "balance_loss_mlp": 1.00071001, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 2.15727330431317, + "language_loss": 0.80781335, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83053589, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.616096258163452 + }, + { + "auxiliary_loss_clip": 0.01123392, + "auxiliary_loss_mlp": 0.01131614, + "balance_loss_clip": 1.00205576, + "balance_loss_mlp": 1.00086558, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.918847585631809, + "language_loss": 0.64657331, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66912329, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.7142438888549805 + }, + { + "auxiliary_loss_clip": 0.01122544, + "auxiliary_loss_mlp": 0.01131666, + "balance_loss_clip": 1.0019933, + "balance_loss_mlp": 1.00063086, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.6641799772388055, + "language_loss": 0.77624929, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79879141, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.639150619506836 + }, + { + "auxiliary_loss_clip": 0.01156656, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_clip": 1.00200284, + "balance_loss_mlp": 1.00088692, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.9169763510449194, + "language_loss": 0.79317826, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81607354, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.5644683837890625 + }, + { + "auxiliary_loss_clip": 0.01153118, + "auxiliary_loss_mlp": 0.01116102, + "balance_loss_clip": 1.00306916, + "balance_loss_mlp": 1.00013566, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7564094664278537, + "language_loss": 0.52568197, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54837418, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 4.787544250488281 + }, + { + "auxiliary_loss_clip": 0.01125652, + "auxiliary_loss_mlp": 0.01131476, + "balance_loss_clip": 1.00206649, + "balance_loss_mlp": 1.00063205, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 1.9769299937184777, + "language_loss": 0.86451942, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88709068, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.639756441116333 + }, + { + "auxiliary_loss_clip": 0.01155398, + "auxiliary_loss_mlp": 0.00748093, + "balance_loss_clip": 1.00200188, + "balance_loss_mlp": 1.00070357, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 2.050020522529478, + "language_loss": 0.78441525, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80345017, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.535087823867798 + }, + { + "auxiliary_loss_clip": 0.01156268, + "auxiliary_loss_mlp": 0.01130774, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00078845, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.8340660959857713, + "language_loss": 0.82495475, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84782517, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.562697172164917 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00098836, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 1.8457949502580349, + "language_loss": 0.72397125, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74650681, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.6332855224609375 + }, + { + "auxiliary_loss_clip": 0.01092279, + "auxiliary_loss_mlp": 0.01115468, + "balance_loss_clip": 1.00242329, + "balance_loss_mlp": 1.00026405, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.8173934485414404, + "language_loss": 0.61217535, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63425279, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.254868507385254 + }, + { + "auxiliary_loss_clip": 0.01140266, + "auxiliary_loss_mlp": 0.01132124, + "balance_loss_clip": 1.00220716, + "balance_loss_mlp": 1.00080264, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 3.5337814235007143, + "language_loss": 0.75969493, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78241885, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.6393566131591797 + }, + { + "auxiliary_loss_clip": 0.0117195, + "auxiliary_loss_mlp": 0.01132081, + "balance_loss_clip": 1.00224519, + "balance_loss_mlp": 1.00085568, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.4791577546167645, + "language_loss": 0.75177824, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7748186, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 3.9917380809783936 + }, + { + "auxiliary_loss_clip": 0.0113985, + "auxiliary_loss_mlp": 0.01131163, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00089073, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 1.9287411660035383, + "language_loss": 0.80018926, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.8228994, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.5882136821746826 + }, + { + "auxiliary_loss_clip": 0.01145906, + "auxiliary_loss_mlp": 0.01131566, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00072169, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 2.4707873864696075, + "language_loss": 0.80225492, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82502961, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.643331527709961 + }, + { + "auxiliary_loss_clip": 0.01140017, + "auxiliary_loss_mlp": 0.00748087, + "balance_loss_clip": 1.0020684, + "balance_loss_mlp": 1.00072873, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.5950102222855678, + "language_loss": 0.84682429, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.86570537, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 2.606426954269409 + }, + { + "auxiliary_loss_clip": 0.0112442, + "auxiliary_loss_mlp": 0.01132104, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00106955, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.465469850078734, + "language_loss": 0.87801963, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90058482, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 4.141587018966675 + }, + { + "auxiliary_loss_clip": 0.01155139, + "auxiliary_loss_mlp": 0.01132323, + "balance_loss_clip": 1.00218964, + "balance_loss_mlp": 1.0010972, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 1.910210038399253, + "language_loss": 0.67336613, + "learning_rate": 2.937196549795971e-06, + "loss": 0.69624078, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.5700485706329346 + }, + { + "auxiliary_loss_clip": 0.01138878, + "auxiliary_loss_mlp": 0.01132549, + "balance_loss_clip": 1.00215018, + "balance_loss_mlp": 1.00084674, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.143122686537286, + "language_loss": 0.7516607, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77437496, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 4.339061737060547 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01132044, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00072336, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.750336633779081, + "language_loss": 0.72494245, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74764973, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.6383397579193115 + }, + { + "auxiliary_loss_clip": 0.01156506, + "auxiliary_loss_mlp": 0.01131061, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00088406, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.859243906265989, + "language_loss": 0.67841387, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70128965, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.5472705364227295 + }, + { + "auxiliary_loss_clip": 0.01139172, + "auxiliary_loss_mlp": 0.01131951, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00082111, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.887886329704179, + "language_loss": 0.74296314, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76567435, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.599449872970581 + }, + { + "auxiliary_loss_clip": 0.01138493, + "auxiliary_loss_mlp": 0.01131802, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.00086236, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 9.271385328426057, + "language_loss": 0.75509191, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77779484, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.6638309955596924 + }, + { + "auxiliary_loss_clip": 0.01161135, + "auxiliary_loss_mlp": 0.01131145, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.00068283, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.1724396534263604, + "language_loss": 0.76621175, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.7891345, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.5167527198791504 + }, + { + "auxiliary_loss_clip": 0.01171685, + "auxiliary_loss_mlp": 0.01131126, + "balance_loss_clip": 1.00223362, + "balance_loss_mlp": 1.00075865, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.052040171596299, + "language_loss": 0.7071197, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73014784, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.5099637508392334 + }, + { + "auxiliary_loss_clip": 0.01156555, + "auxiliary_loss_mlp": 0.0113169, + "balance_loss_clip": 1.00204694, + "balance_loss_mlp": 1.0006547, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 2.031842569404854, + "language_loss": 0.73354518, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.75642771, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.4900963306427 + }, + { + "auxiliary_loss_clip": 0.01138421, + "auxiliary_loss_mlp": 0.01130871, + "balance_loss_clip": 1.0020926, + "balance_loss_mlp": 1.00079012, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 2.741177886526614, + "language_loss": 0.6631493, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68584228, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.6838693618774414 + }, + { + "auxiliary_loss_clip": 0.01155292, + "auxiliary_loss_mlp": 0.01130678, + "balance_loss_clip": 1.00199711, + "balance_loss_mlp": 1.00069261, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.7659090359123741, + "language_loss": 0.74020678, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76306653, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.5419671535491943 + }, + { + "auxiliary_loss_clip": 0.01156137, + "auxiliary_loss_mlp": 0.01131226, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.00076377, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 1.744157848634134, + "language_loss": 0.8825326, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90540624, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.572417736053467 + }, + { + "auxiliary_loss_clip": 0.01154871, + "auxiliary_loss_mlp": 0.01131474, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00091624, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 1.7532470978561674, + "language_loss": 0.72767258, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.75053596, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.5468952655792236 + }, + { + "auxiliary_loss_clip": 0.01089808, + "auxiliary_loss_mlp": 0.01132219, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00080228, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.8349114358546301, + "language_loss": 0.66605818, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68827844, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.6915183067321777 + }, + { + "auxiliary_loss_clip": 0.01123072, + "auxiliary_loss_mlp": 0.01131059, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00069201, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.4888083215986634, + "language_loss": 0.73126531, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75380665, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.6764020919799805 + }, + { + "auxiliary_loss_clip": 0.01123187, + "auxiliary_loss_mlp": 0.01132424, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.00091219, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.1999966130088757, + "language_loss": 0.89845324, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.92100942, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.607970714569092 + }, + { + "auxiliary_loss_clip": 0.01154815, + "auxiliary_loss_mlp": 0.01131267, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00089943, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 3.24272636985009, + "language_loss": 0.6958971, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71875787, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 2.5126736164093018 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01116345, + "balance_loss_clip": 1.00280738, + "balance_loss_mlp": 1.00037825, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7560510508802718, + "language_loss": 0.61781621, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.64067781, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.1885833740234375 + }, + { + "auxiliary_loss_clip": 0.01140815, + "auxiliary_loss_mlp": 0.01131659, + "balance_loss_clip": 1.00200689, + "balance_loss_mlp": 1.00100541, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.9197237135311735, + "language_loss": 0.78066456, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80338931, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 2.591578960418701 + }, + { + "auxiliary_loss_clip": 0.01155031, + "auxiliary_loss_mlp": 0.01131256, + "balance_loss_clip": 1.00218618, + "balance_loss_mlp": 1.0007937, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 1.7154837989728458, + "language_loss": 0.62798131, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.6508441, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.72314453125 + }, + { + "auxiliary_loss_clip": 0.01124336, + "auxiliary_loss_mlp": 0.01132384, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00077677, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.4320666602212744, + "language_loss": 0.67686498, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69943219, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.6464059352874756 + }, + { + "auxiliary_loss_clip": 0.01138828, + "auxiliary_loss_mlp": 0.0074813, + "balance_loss_clip": 1.00218868, + "balance_loss_mlp": 1.00073779, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.6451406045824426, + "language_loss": 0.74739468, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76626432, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.645578145980835 + }, + { + "auxiliary_loss_clip": 0.01091913, + "auxiliary_loss_mlp": 0.00747939, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00057554, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.8618236697793176, + "language_loss": 0.82936108, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84775954, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.763659954071045 + }, + { + "auxiliary_loss_clip": 0.01137771, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_clip": 1.0025034, + "balance_loss_mlp": 1.00006723, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.815200175270381, + "language_loss": 0.59329438, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61583239, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.272170305252075 + }, + { + "auxiliary_loss_clip": 0.01125198, + "auxiliary_loss_mlp": 0.01131067, + "balance_loss_clip": 1.00226712, + "balance_loss_mlp": 1.00098598, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.1853919297048634, + "language_loss": 0.72992003, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75248271, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.636829376220703 + }, + { + "auxiliary_loss_clip": 0.01121414, + "auxiliary_loss_mlp": 0.01131166, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00089455, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 1.9061775752032595, + "language_loss": 0.7798245, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80235028, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 4.063299655914307 + }, + { + "auxiliary_loss_clip": 0.01156311, + "auxiliary_loss_mlp": 0.0113114, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00058246, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 2.0412776199436076, + "language_loss": 0.76265651, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.78553104, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.618643045425415 + }, + { + "auxiliary_loss_clip": 0.01106719, + "auxiliary_loss_mlp": 0.01132658, + "balance_loss_clip": 1.00219405, + "balance_loss_mlp": 1.0008601, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.1847035110359534, + "language_loss": 0.70655036, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72894412, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 2.6632959842681885 + }, + { + "auxiliary_loss_clip": 0.01155064, + "auxiliary_loss_mlp": 0.01131605, + "balance_loss_clip": 1.00201869, + "balance_loss_mlp": 1.00057054, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.9360557884000837, + "language_loss": 0.79528624, + "learning_rate": 2.92754912981472e-06, + "loss": 0.8181529, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.7133383750915527 + }, + { + "auxiliary_loss_clip": 0.01124326, + "auxiliary_loss_mlp": 0.01130433, + "balance_loss_clip": 1.0019449, + "balance_loss_mlp": 1.000543, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.8715587891564163, + "language_loss": 0.70898044, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73152804, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.619908332824707 + }, + { + "auxiliary_loss_clip": 0.01138487, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_clip": 1.00222611, + "balance_loss_mlp": 1.00111985, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 2.460113398253539, + "language_loss": 0.74254704, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76525062, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.551849126815796 + }, + { + "auxiliary_loss_clip": 0.01096276, + "auxiliary_loss_mlp": 0.01130789, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00080299, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.9921253283295557, + "language_loss": 0.72919172, + "learning_rate": 2.926513837074284e-06, + "loss": 0.75146234, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.6946942806243896 + }, + { + "auxiliary_loss_clip": 0.01156607, + "auxiliary_loss_mlp": 0.01131404, + "balance_loss_clip": 1.00217819, + "balance_loss_mlp": 1.00084639, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 2.0920168603309603, + "language_loss": 0.78326267, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80614281, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.5357561111450195 + }, + { + "auxiliary_loss_clip": 0.0115492, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_clip": 1.00194108, + "balance_loss_mlp": 1.00067997, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 13.997041959833604, + "language_loss": 0.74555314, + "learning_rate": 2.925823466224696e-06, + "loss": 0.7684195, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 3.9966704845428467 + }, + { + "auxiliary_loss_clip": 0.01171877, + "auxiliary_loss_mlp": 0.01132022, + "balance_loss_clip": 1.00220776, + "balance_loss_mlp": 1.00108266, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.606074720324838, + "language_loss": 0.79268819, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81572717, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.5771305561065674 + }, + { + "auxiliary_loss_clip": 0.01144465, + "auxiliary_loss_mlp": 0.00748154, + "balance_loss_clip": 1.00214159, + "balance_loss_mlp": 1.00067687, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.7447072626974762, + "language_loss": 0.73545408, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75438023, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.5521275997161865 + }, + { + "auxiliary_loss_clip": 0.01122802, + "auxiliary_loss_mlp": 0.01131578, + "balance_loss_clip": 1.00182211, + "balance_loss_mlp": 1.00073361, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.558767657695024, + "language_loss": 0.66958559, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69212937, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 4.266164779663086 + }, + { + "auxiliary_loss_clip": 0.01089403, + "auxiliary_loss_mlp": 0.01131851, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.0008167, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 4.127175588756444, + "language_loss": 0.77787769, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.80009019, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.808504819869995 + }, + { + "auxiliary_loss_clip": 0.01156383, + "auxiliary_loss_mlp": 0.01131463, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00080967, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.653825232808024, + "language_loss": 0.73502296, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75790143, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 4.0518646240234375 + }, + { + "auxiliary_loss_clip": 0.01138096, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00080371, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.913543653893276, + "language_loss": 0.84461331, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86730027, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 2.547616958618164 + }, + { + "auxiliary_loss_clip": 0.01123511, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00067997, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.4133825815164465, + "language_loss": 0.70427883, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72683108, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.6475677490234375 + }, + { + "auxiliary_loss_clip": 0.01138789, + "auxiliary_loss_mlp": 0.01132144, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00091839, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0366589205953076, + "language_loss": 0.76419699, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78690624, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.6536715030670166 + }, + { + "auxiliary_loss_clip": 0.01155263, + "auxiliary_loss_mlp": 0.01132319, + "balance_loss_clip": 1.00208664, + "balance_loss_mlp": 1.00080752, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.7953864917072144, + "language_loss": 0.70171845, + "learning_rate": 2.922715061101625e-06, + "loss": 0.7245943, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.7694640159606934 + }, + { + "auxiliary_loss_clip": 0.01104733, + "auxiliary_loss_mlp": 0.01131284, + "balance_loss_clip": 1.00164473, + "balance_loss_mlp": 1.00082123, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.7300028453345668, + "language_loss": 0.72128642, + "learning_rate": 2.922369507632716e-06, + "loss": 0.74364656, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.6140267848968506 + }, + { + "auxiliary_loss_clip": 0.01156544, + "auxiliary_loss_mlp": 0.01131796, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00076151, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 1.830890534470317, + "language_loss": 0.81394911, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83683252, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.549828290939331 + }, + { + "auxiliary_loss_clip": 0.01171736, + "auxiliary_loss_mlp": 0.01132662, + "balance_loss_clip": 1.00211799, + "balance_loss_mlp": 1.00095999, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7668035959518036, + "language_loss": 0.81310719, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83615112, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.534852981567383 + }, + { + "auxiliary_loss_clip": 0.01124116, + "auxiliary_loss_mlp": 0.00746806, + "balance_loss_clip": 1.00238895, + "balance_loss_mlp": 1.00001144, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6909862668243216, + "language_loss": 0.59265935, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61136854, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 3.2957711219787598 + }, + { + "auxiliary_loss_clip": 0.01137642, + "auxiliary_loss_mlp": 0.01131157, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00069475, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.7012599844625946, + "language_loss": 0.74679548, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76948345, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.596282720565796 + }, + { + "auxiliary_loss_clip": 0.01155833, + "auxiliary_loss_mlp": 0.0113224, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.00082397, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 1.949475985453293, + "language_loss": 0.73263752, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75551832, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.5712790489196777 + }, + { + "auxiliary_loss_clip": 0.01092858, + "auxiliary_loss_mlp": 0.01131886, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00085151, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.9083786330386885, + "language_loss": 0.53272909, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55497652, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.710172176361084 + }, + { + "auxiliary_loss_clip": 0.01156462, + "auxiliary_loss_mlp": 0.01131502, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.00084901, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.9800120498263742, + "language_loss": 0.80314082, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82602048, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.600087881088257 + }, + { + "auxiliary_loss_clip": 0.0110628, + "auxiliary_loss_mlp": 0.01131833, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00079846, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.4688071699520888, + "language_loss": 0.72346485, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74584597, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.73374342918396 + }, + { + "auxiliary_loss_clip": 0.0115643, + "auxiliary_loss_mlp": 0.01131541, + "balance_loss_clip": 1.00212562, + "balance_loss_mlp": 1.00088787, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.7440505037623097, + "language_loss": 0.85072136, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87360108, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.5850372314453125 + }, + { + "auxiliary_loss_clip": 0.01156137, + "auxiliary_loss_mlp": 0.01132226, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00080919, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 2.02193633789429, + "language_loss": 0.78686112, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80974478, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.01156637, + "auxiliary_loss_mlp": 0.01131751, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00100231, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.6858218456354828, + "language_loss": 0.67125946, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69414335, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.531135082244873 + }, + { + "auxiliary_loss_clip": 0.01139469, + "auxiliary_loss_mlp": 0.01131103, + "balance_loss_clip": 1.00202835, + "balance_loss_mlp": 1.00073576, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.2399738229056, + "language_loss": 0.76735139, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.79005706, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 2.538806438446045 + }, + { + "auxiliary_loss_clip": 0.01106713, + "auxiliary_loss_mlp": 0.01131567, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00072312, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 2.148249416426982, + "language_loss": 0.63044107, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65282381, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.6479337215423584 + }, + { + "auxiliary_loss_clip": 0.01141442, + "auxiliary_loss_mlp": 0.01131785, + "balance_loss_clip": 1.00218952, + "balance_loss_mlp": 1.00075018, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.7666323270916768, + "language_loss": 0.73252201, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75525433, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.6228842735290527 + }, + { + "auxiliary_loss_clip": 0.01155089, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.00109076, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 1.62747582293755, + "language_loss": 0.72530246, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.74818224, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.5324196815490723 + }, + { + "auxiliary_loss_clip": 0.01139558, + "auxiliary_loss_mlp": 0.01131536, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00078773, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.9203999427693255, + "language_loss": 0.80150712, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82421803, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.578782081604004 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01132121, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00089538, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8967103843554531, + "language_loss": 0.64425987, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66679966, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.641284227371216 + }, + { + "auxiliary_loss_clip": 0.01155006, + "auxiliary_loss_mlp": 0.01131527, + "balance_loss_clip": 1.00221479, + "balance_loss_mlp": 1.00077844, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.807086655893864, + "language_loss": 0.71518517, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73805052, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.6066997051239014 + }, + { + "auxiliary_loss_clip": 0.01139549, + "auxiliary_loss_mlp": 0.01131452, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00070322, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.7540619241294175, + "language_loss": 0.6925872, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71529722, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.646725654602051 + }, + { + "auxiliary_loss_clip": 0.01156569, + "auxiliary_loss_mlp": 0.01132221, + "balance_loss_clip": 1.0021528, + "balance_loss_mlp": 1.00090003, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.9114931725499655, + "language_loss": 0.73039758, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.75328553, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 3.9857354164123535 + }, + { + "auxiliary_loss_clip": 0.01144349, + "auxiliary_loss_mlp": 0.01131595, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00075054, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 2.2548824620422554, + "language_loss": 0.74106777, + "learning_rate": 2.915104825441114e-06, + "loss": 0.7638272, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.6392555236816406 + }, + { + "auxiliary_loss_clip": 0.0115659, + "auxiliary_loss_mlp": 0.01133024, + "balance_loss_clip": 1.00217593, + "balance_loss_mlp": 1.00093997, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.794611787256955, + "language_loss": 0.78167635, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80457246, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.515845537185669 + }, + { + "auxiliary_loss_clip": 0.01154585, + "auxiliary_loss_mlp": 0.01132197, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00078046, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.0844208408595786, + "language_loss": 0.6549108, + "learning_rate": 2.914412150914888e-06, + "loss": 0.67777854, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.501197099685669 + }, + { + "auxiliary_loss_clip": 0.01138134, + "auxiliary_loss_mlp": 0.01131847, + "balance_loss_clip": 1.00198495, + "balance_loss_mlp": 1.00081205, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.8009019991661361, + "language_loss": 0.70290613, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72560596, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.7059710025787354 + }, + { + "auxiliary_loss_clip": 0.01139992, + "auxiliary_loss_mlp": 0.01132048, + "balance_loss_clip": 1.00200295, + "balance_loss_mlp": 1.00091791, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 1.7201883171274441, + "language_loss": 0.75395131, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77667177, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.547492265701294 + }, + { + "auxiliary_loss_clip": 0.01156297, + "auxiliary_loss_mlp": 0.01131553, + "balance_loss_clip": 1.002074, + "balance_loss_mlp": 1.00080454, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.6648307445390953, + "language_loss": 0.84422529, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86710382, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.603811025619507 + }, + { + "auxiliary_loss_clip": 0.01135639, + "auxiliary_loss_mlp": 0.01115375, + "balance_loss_clip": 1.00225043, + "balance_loss_mlp": 1.00017166, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8800217259620522, + "language_loss": 0.60254914, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62505931, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 4.660333156585693 + }, + { + "auxiliary_loss_clip": 0.01123098, + "auxiliary_loss_mlp": 0.01131097, + "balance_loss_clip": 1.00196326, + "balance_loss_mlp": 1.00063479, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5517283281391516, + "language_loss": 0.72864056, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.7511825, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.697711706161499 + }, + { + "auxiliary_loss_clip": 0.01156513, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_clip": 1.00207186, + "balance_loss_mlp": 1.00086331, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.6879044128962206, + "language_loss": 0.74202853, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76491648, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 2.589038372039795 + }, + { + "auxiliary_loss_clip": 0.01094759, + "auxiliary_loss_mlp": 0.01130965, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00088334, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.5577429909487224, + "language_loss": 0.71540523, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73766243, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.7736001014709473 + }, + { + "auxiliary_loss_clip": 0.01122944, + "auxiliary_loss_mlp": 0.01131648, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00061274, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 2.7309013608066124, + "language_loss": 0.75159502, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77414095, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 4.35894513130188 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01115267, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.00006294, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8131312097219172, + "language_loss": 0.58723146, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60958755, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.176018714904785 + }, + { + "auxiliary_loss_clip": 0.01141423, + "auxiliary_loss_mlp": 0.01130765, + "balance_loss_clip": 1.00221777, + "balance_loss_mlp": 1.00058794, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 6.89486845880977, + "language_loss": 0.79246998, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81519186, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 4.068901777267456 + }, + { + "auxiliary_loss_clip": 0.01155916, + "auxiliary_loss_mlp": 0.01131518, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00086451, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 1.981337243784528, + "language_loss": 0.74021357, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76308787, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.560318946838379 + }, + { + "auxiliary_loss_clip": 0.01113464, + "auxiliary_loss_mlp": 0.01131416, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00076294, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.8413177431234655, + "language_loss": 0.65407169, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67652047, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 2.744856595993042 + }, + { + "auxiliary_loss_clip": 0.01121013, + "auxiliary_loss_mlp": 0.01131279, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00100756, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.130672713991616, + "language_loss": 0.71935076, + "learning_rate": 2.909906390418006e-06, + "loss": 0.74187362, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.6125166416168213 + }, + { + "auxiliary_loss_clip": 0.01119235, + "auxiliary_loss_mlp": 0.0111453, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00008905, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7441721998805302, + "language_loss": 0.59270632, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.615044, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.2967848777770996 + }, + { + "auxiliary_loss_clip": 0.01154729, + "auxiliary_loss_mlp": 0.01131511, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00076175, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.7107978241540034, + "language_loss": 0.75237966, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77524209, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.547769546508789 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01130581, + "balance_loss_clip": 1.00200236, + "balance_loss_mlp": 1.00097632, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 1.739466825854658, + "language_loss": 0.76664591, + "learning_rate": 2.908865770392555e-06, + "loss": 0.7895, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.555823564529419 + }, + { + "auxiliary_loss_clip": 0.01156331, + "auxiliary_loss_mlp": 0.01130916, + "balance_loss_clip": 1.00218761, + "balance_loss_mlp": 1.00054836, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.5373109607678015, + "language_loss": 0.81940824, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84228069, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.56491756439209 + }, + { + "auxiliary_loss_clip": 0.01155993, + "auxiliary_loss_mlp": 0.01131023, + "balance_loss_clip": 1.00199962, + "balance_loss_mlp": 1.00075102, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 2.0607257942073836, + "language_loss": 0.77530199, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79817212, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.5417046546936035 + }, + { + "auxiliary_loss_clip": 0.01156651, + "auxiliary_loss_mlp": 0.0113073, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00074434, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 2.023610281445848, + "language_loss": 0.76663345, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.78950727, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.5247275829315186 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01131668, + "balance_loss_clip": 1.00223827, + "balance_loss_mlp": 1.0009191, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.5756262015764324, + "language_loss": 0.80591702, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82863098, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.568815231323242 + }, + { + "auxiliary_loss_clip": 0.01123655, + "auxiliary_loss_mlp": 0.00748165, + "balance_loss_clip": 1.0018034, + "balance_loss_mlp": 1.00080609, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7717315059680543, + "language_loss": 0.83601344, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85473168, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.676847457885742 + }, + { + "auxiliary_loss_clip": 0.01154912, + "auxiliary_loss_mlp": 0.01131267, + "balance_loss_clip": 1.00216234, + "balance_loss_mlp": 1.00070858, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.0404158730835182, + "language_loss": 0.740605, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76346672, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.5694997310638428 + }, + { + "auxiliary_loss_clip": 0.01171684, + "auxiliary_loss_mlp": 0.01131904, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00077343, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 2.2662428220663493, + "language_loss": 0.71107507, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73411095, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.54875111579895 + }, + { + "auxiliary_loss_clip": 0.01140019, + "auxiliary_loss_mlp": 0.01131244, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00078154, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.5996632702743516, + "language_loss": 0.81506121, + "learning_rate": 2.906089268194611e-06, + "loss": 0.83777392, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.567405939102173 + }, + { + "auxiliary_loss_clip": 0.01137624, + "auxiliary_loss_mlp": 0.01114615, + "balance_loss_clip": 1.00232673, + "balance_loss_mlp": 1.0001744, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.7952126436865885, + "language_loss": 0.63132507, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65384746, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.2664549350738525 + }, + { + "auxiliary_loss_clip": 0.01112259, + "auxiliary_loss_mlp": 0.01130283, + "balance_loss_clip": 1.00226712, + "balance_loss_mlp": 1.00096428, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.172911148617672, + "language_loss": 0.7031725, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72559792, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.6706349849700928 + }, + { + "auxiliary_loss_clip": 0.01154825, + "auxiliary_loss_mlp": 0.01131187, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.0007242, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 1.9136015703745661, + "language_loss": 0.72525781, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74811792, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 2.5617008209228516 + }, + { + "auxiliary_loss_clip": 0.01139747, + "auxiliary_loss_mlp": 0.01130783, + "balance_loss_clip": 1.00220275, + "balance_loss_mlp": 1.00060654, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.8231034202443077, + "language_loss": 0.68237412, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70507944, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.569234609603882 + }, + { + "auxiliary_loss_clip": 0.01156329, + "auxiliary_loss_mlp": 0.01130518, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.00053215, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.787812095618452, + "language_loss": 0.67665285, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.69952136, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.539405584335327 + }, + { + "auxiliary_loss_clip": 0.01145465, + "auxiliary_loss_mlp": 0.01130641, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00065506, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7751636687624541, + "language_loss": 0.82260758, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84536862, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.572068691253662 + }, + { + "auxiliary_loss_clip": 0.01106138, + "auxiliary_loss_mlp": 0.01131611, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00067174, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.966917700189709, + "language_loss": 0.76198816, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78436559, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.634284257888794 + }, + { + "auxiliary_loss_clip": 0.01171548, + "auxiliary_loss_mlp": 0.01131728, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00069308, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.2612165719499098, + "language_loss": 0.6914354, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71446818, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.4726741313934326 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.01130542, + "balance_loss_clip": 1.00174272, + "balance_loss_mlp": 1.00084209, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.6373313436991561, + "language_loss": 0.70778584, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73048198, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.6111953258514404 + }, + { + "auxiliary_loss_clip": 0.01137705, + "auxiliary_loss_mlp": 0.01129942, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00071883, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.520243458754193, + "language_loss": 0.78867787, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.8113544, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 4.054744720458984 + }, + { + "auxiliary_loss_clip": 0.0117147, + "auxiliary_loss_mlp": 0.01130967, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.00059998, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.749342734549407, + "language_loss": 0.79640788, + "learning_rate": 2.902267988534295e-06, + "loss": 0.8194322, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.5269739627838135 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.00748168, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00081944, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.773193875456166, + "language_loss": 0.79291928, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81179214, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.5895893573760986 + }, + { + "auxiliary_loss_clip": 0.01154662, + "auxiliary_loss_mlp": 0.01130969, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00050688, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6663354518843958, + "language_loss": 0.68318748, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70604384, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.534759521484375 + }, + { + "auxiliary_loss_clip": 0.01139893, + "auxiliary_loss_mlp": 0.01131353, + "balance_loss_clip": 1.00209522, + "balance_loss_mlp": 1.00089049, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.0168269894573903, + "language_loss": 0.83358139, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.8562938, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.6274447441101074 + }, + { + "auxiliary_loss_clip": 0.01144314, + "auxiliary_loss_mlp": 0.01131666, + "balance_loss_clip": 1.00221562, + "balance_loss_mlp": 1.00082195, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 2.012267443024628, + "language_loss": 0.69073296, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71349275, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.621751070022583 + }, + { + "auxiliary_loss_clip": 0.01119506, + "auxiliary_loss_mlp": 0.01114683, + "balance_loss_clip": 1.00197184, + "balance_loss_mlp": 1.00024211, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7909678273003754, + "language_loss": 0.57027423, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59261614, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 3.080972909927368 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01130339, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00082982, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.9596189941271862, + "language_loss": 0.75189072, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77458966, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.5798563957214355 + }, + { + "auxiliary_loss_clip": 0.01156248, + "auxiliary_loss_mlp": 0.00748165, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00087988, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 1.6291804298693116, + "language_loss": 0.73771465, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75675875, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 3.939113140106201 + }, + { + "auxiliary_loss_clip": 0.01171518, + "auxiliary_loss_mlp": 0.01130605, + "balance_loss_clip": 1.00217724, + "balance_loss_mlp": 1.0007143, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.4936251840115036, + "language_loss": 0.79430354, + "learning_rate": 2.899486274782127e-06, + "loss": 0.8173247, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.5201151371002197 + }, + { + "auxiliary_loss_clip": 0.01154747, + "auxiliary_loss_mlp": 0.01131318, + "balance_loss_clip": 1.00209534, + "balance_loss_mlp": 1.0007602, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.5743890751657132, + "language_loss": 0.76512671, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78798735, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.567483901977539 + }, + { + "auxiliary_loss_clip": 0.01139262, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00058734, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 2.1152560286819697, + "language_loss": 0.80755216, + "learning_rate": 2.898790504994232e-06, + "loss": 0.83024859, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 2.5556888580322266 + }, + { + "auxiliary_loss_clip": 0.011548, + "auxiliary_loss_mlp": 0.01131311, + "balance_loss_clip": 1.00205207, + "balance_loss_mlp": 1.00084805, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 1.9481659253967385, + "language_loss": 0.59454387, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61740494, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 4.0288166999816895 + }, + { + "auxiliary_loss_clip": 0.01139135, + "auxiliary_loss_mlp": 0.01131019, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00084221, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 2.0232930593047618, + "language_loss": 0.79997706, + "learning_rate": 2.898094598877435e-06, + "loss": 0.82267857, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.5305862426757812 + }, + { + "auxiliary_loss_clip": 0.01171399, + "auxiliary_loss_mlp": 0.0113044, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00074077, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.848397057124025, + "language_loss": 0.79998994, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82300842, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 4.013415813446045 + }, + { + "auxiliary_loss_clip": 0.01154841, + "auxiliary_loss_mlp": 0.01130855, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.00086892, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.625427275779863, + "language_loss": 0.8854214, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90827835, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.635833501815796 + }, + { + "auxiliary_loss_clip": 0.01154953, + "auxiliary_loss_mlp": 0.01130693, + "balance_loss_clip": 1.00213933, + "balance_loss_mlp": 1.00089836, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.668409338576313, + "language_loss": 0.73754239, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.76039886, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.5376150608062744 + }, + { + "auxiliary_loss_clip": 0.01123533, + "auxiliary_loss_mlp": 0.0113105, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00096869, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.9958192973769766, + "language_loss": 0.75534463, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77789044, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 2.605091094970703 + }, + { + "auxiliary_loss_clip": 0.01090822, + "auxiliary_loss_mlp": 0.01131408, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.0008496, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.5712564394725184, + "language_loss": 0.72052157, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74274385, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.6964337825775146 + }, + { + "auxiliary_loss_clip": 0.011715, + "auxiliary_loss_mlp": 0.01131696, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00085187, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 1.7389957978280897, + "language_loss": 0.70388305, + "learning_rate": 2.896006063609283e-06, + "loss": 0.726915, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.5286993980407715 + }, + { + "auxiliary_loss_clip": 0.01139568, + "auxiliary_loss_mlp": 0.0113082, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.00083447, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.8007826535134965, + "language_loss": 0.77995306, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80265695, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.5824642181396484 + }, + { + "auxiliary_loss_clip": 0.01154903, + "auxiliary_loss_mlp": 0.01130967, + "balance_loss_clip": 1.00207841, + "balance_loss_mlp": 1.00107694, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 2.55913874455526, + "language_loss": 0.79073673, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.81359547, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.542022705078125 + }, + { + "auxiliary_loss_clip": 0.01135124, + "auxiliary_loss_mlp": 0.01114454, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00001359, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7876501415380416, + "language_loss": 0.57488871, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59738445, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 3.145808219909668 + }, + { + "auxiliary_loss_clip": 0.01156337, + "auxiliary_loss_mlp": 0.00748386, + "balance_loss_clip": 1.00205243, + "balance_loss_mlp": 1.00103998, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.0561288082757883, + "language_loss": 0.77081639, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78986365, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.5671193599700928 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.01130896, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00081491, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 5.360237472981173, + "language_loss": 0.71795034, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74050307, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.6239025592803955 + }, + { + "auxiliary_loss_clip": 0.0110867, + "auxiliary_loss_mlp": 0.01130877, + "balance_loss_clip": 1.0019418, + "balance_loss_mlp": 1.00070047, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.477207848013366, + "language_loss": 0.76842612, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79082155, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.672184944152832 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01131826, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00079083, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8479452393884135, + "language_loss": 0.83905435, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.86192095, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.571367025375366 + }, + { + "auxiliary_loss_clip": 0.01156257, + "auxiliary_loss_mlp": 0.01130158, + "balance_loss_clip": 1.00204909, + "balance_loss_mlp": 1.00074482, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.8753443157791312, + "language_loss": 0.84938824, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87225246, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.595297336578369 + }, + { + "auxiliary_loss_clip": 0.01138221, + "auxiliary_loss_mlp": 0.01130391, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00078678, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 4.804300517786753, + "language_loss": 0.65278912, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67547524, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.7541613578796387 + }, + { + "auxiliary_loss_clip": 0.0114069, + "auxiliary_loss_mlp": 0.01131192, + "balance_loss_clip": 1.00199866, + "balance_loss_mlp": 1.00082469, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 1.7124543605000802, + "language_loss": 0.8401649, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86288369, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.6031653881073 + }, + { + "auxiliary_loss_clip": 0.01140144, + "auxiliary_loss_mlp": 0.01131329, + "balance_loss_clip": 1.0020684, + "balance_loss_mlp": 1.00077116, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.4864053054551007, + "language_loss": 0.88466209, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90737683, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.543941020965576 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01131496, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00074744, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.5683469211025394, + "language_loss": 0.73836648, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76075947, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.6850650310516357 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01130776, + "balance_loss_clip": 1.00208616, + "balance_loss_mlp": 1.00069535, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.3326826552386377, + "language_loss": 0.79638326, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81940544, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 2.52095365524292 + }, + { + "auxiliary_loss_clip": 0.0112231, + "auxiliary_loss_mlp": 0.01130153, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00064361, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7000433907172525, + "language_loss": 0.84154022, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86406481, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.6066198348999023 + }, + { + "auxiliary_loss_clip": 0.01138, + "auxiliary_loss_mlp": 0.01130427, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00072718, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.2091008151940024, + "language_loss": 0.76933777, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79202205, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.584977626800537 + }, + { + "auxiliary_loss_clip": 0.01139654, + "auxiliary_loss_mlp": 0.01130226, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.0007174, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.5596389916633577, + "language_loss": 0.79640561, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81910443, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.5718584060668945 + }, + { + "auxiliary_loss_clip": 0.01154746, + "auxiliary_loss_mlp": 0.01130735, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.0008446, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.9088329335628549, + "language_loss": 0.83815461, + "learning_rate": 2.890081914052443e-06, + "loss": 0.86100948, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 2.510761022567749 + }, + { + "auxiliary_loss_clip": 0.01171342, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00080967, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.4418017349980996, + "language_loss": 0.64609253, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66910815, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.5012688636779785 + }, + { + "auxiliary_loss_clip": 0.01154591, + "auxiliary_loss_mlp": 0.01130459, + "balance_loss_clip": 1.00207686, + "balance_loss_mlp": 1.0009501, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 1.3647227730074314, + "language_loss": 0.74002731, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76287782, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 3.9925577640533447 + }, + { + "auxiliary_loss_clip": 0.01138051, + "auxiliary_loss_mlp": 0.01130112, + "balance_loss_clip": 1.00207281, + "balance_loss_mlp": 1.00069869, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.8915705564459893, + "language_loss": 0.80765915, + "learning_rate": 2.889035461484742e-06, + "loss": 0.83034074, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 2.9679715633392334 + }, + { + "auxiliary_loss_clip": 0.01122579, + "auxiliary_loss_mlp": 0.01130472, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00086749, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 1.747311378415171, + "language_loss": 0.60208386, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62461436, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.7801461219787598 + }, + { + "auxiliary_loss_clip": 0.01154365, + "auxiliary_loss_mlp": 0.01131316, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00085282, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 2.3871183264167555, + "language_loss": 0.73361623, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75647306, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.6556057929992676 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01130463, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.00085855, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 1.7557445483747172, + "language_loss": 0.74069369, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76339275, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.5931196212768555 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01129887, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00085497, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6334015668855935, + "language_loss": 0.81441295, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83708644, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.594407320022583 + }, + { + "auxiliary_loss_clip": 0.01154938, + "auxiliary_loss_mlp": 0.01131023, + "balance_loss_clip": 1.00208461, + "balance_loss_mlp": 1.00084686, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.7943799206679543, + "language_loss": 0.75472212, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77758175, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.585033416748047 + }, + { + "auxiliary_loss_clip": 0.01156192, + "auxiliary_loss_mlp": 0.01130647, + "balance_loss_clip": 1.00210261, + "balance_loss_mlp": 1.00085211, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.9186652670342819, + "language_loss": 0.78064317, + "learning_rate": 2.886941646474128e-06, + "loss": 0.8035115, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 2.5120432376861572 + }, + { + "auxiliary_loss_clip": 0.01171485, + "auxiliary_loss_mlp": 0.01130449, + "balance_loss_clip": 1.00211072, + "balance_loss_mlp": 1.00065398, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.0554754234204218, + "language_loss": 0.93453926, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95755863, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 3.881915330886841 + }, + { + "auxiliary_loss_clip": 0.0112892, + "auxiliary_loss_mlp": 0.011302, + "balance_loss_clip": 1.0021162, + "balance_loss_mlp": 1.00069058, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 1.8877573736448454, + "language_loss": 0.83019888, + "learning_rate": 2.886243438932759e-06, + "loss": 0.85279006, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 2.595031499862671 + }, + { + "auxiliary_loss_clip": 0.01156414, + "auxiliary_loss_mlp": 0.01130451, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.00084662, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.275853229506163, + "language_loss": 0.73387522, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75674391, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.5292696952819824 + }, + { + "auxiliary_loss_clip": 0.01121843, + "auxiliary_loss_mlp": 0.01130842, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.00085616, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.5744086395104335, + "language_loss": 0.70699275, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72951955, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 4.023401260375977 + }, + { + "auxiliary_loss_clip": 0.01113813, + "auxiliary_loss_mlp": 0.01130892, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.00071537, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.6540294272392315, + "language_loss": 0.77291083, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79535788, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.6518185138702393 + }, + { + "auxiliary_loss_clip": 0.01154715, + "auxiliary_loss_mlp": 0.01130555, + "balance_loss_clip": 1.00205028, + "balance_loss_mlp": 1.00066483, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.7375708505957845, + "language_loss": 0.73107409, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75392687, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 4.195120096206665 + }, + { + "auxiliary_loss_clip": 0.01154949, + "auxiliary_loss_mlp": 0.01132133, + "balance_loss_clip": 1.00223327, + "balance_loss_mlp": 1.00081205, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 1.8588586770124476, + "language_loss": 0.82068253, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84355336, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.5501439571380615 + }, + { + "auxiliary_loss_clip": 0.01122669, + "auxiliary_loss_mlp": 0.0113118, + "balance_loss_clip": 1.00200915, + "balance_loss_mlp": 1.00100362, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.2423086821555622, + "language_loss": 0.78133178, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.80387026, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.713602066040039 + }, + { + "auxiliary_loss_clip": 0.01139136, + "auxiliary_loss_mlp": 0.0113022, + "balance_loss_clip": 1.00199008, + "balance_loss_mlp": 1.00090146, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.5737292623918142, + "language_loss": 0.84867436, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87136793, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.733022451400757 + }, + { + "auxiliary_loss_clip": 0.01122227, + "auxiliary_loss_mlp": 0.01130573, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00087285, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.6223112416781706, + "language_loss": 0.68254364, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70507157, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.634901762008667 + }, + { + "auxiliary_loss_clip": 0.01143994, + "auxiliary_loss_mlp": 0.01129669, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00073218, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 1.9252259908917542, + "language_loss": 0.6615355, + "learning_rate": 2.883099843007303e-06, + "loss": 0.68427211, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 2.576995849609375 + }, + { + "auxiliary_loss_clip": 0.01138065, + "auxiliary_loss_mlp": 0.01130005, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00078201, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.8026225848732702, + "language_loss": 0.80588222, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82856286, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.5504183769226074 + }, + { + "auxiliary_loss_clip": 0.01154641, + "auxiliary_loss_mlp": 0.01130391, + "balance_loss_clip": 1.00206816, + "balance_loss_mlp": 1.00078702, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.3546320751804461, + "language_loss": 0.78788245, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.81073278, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.585336923599243 + }, + { + "auxiliary_loss_clip": 0.01137275, + "auxiliary_loss_mlp": 0.01129915, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.00078726, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 3.080937230899172, + "language_loss": 0.76533991, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.78801179, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 2.581871747970581 + }, + { + "auxiliary_loss_clip": 0.01123264, + "auxiliary_loss_mlp": 0.01130104, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00069058, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.55770459733335, + "language_loss": 0.82957423, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85210794, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.6108436584472656 + }, + { + "auxiliary_loss_clip": 0.0113927, + "auxiliary_loss_mlp": 0.0113088, + "balance_loss_clip": 1.002033, + "balance_loss_mlp": 1.00098979, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6168414299438092, + "language_loss": 0.76095998, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78366154, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.5715866088867188 + }, + { + "auxiliary_loss_clip": 0.01122531, + "auxiliary_loss_mlp": 0.00748412, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00138783, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.8771988923972747, + "language_loss": 0.70604539, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72475481, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.6503002643585205 + }, + { + "auxiliary_loss_clip": 0.01122238, + "auxiliary_loss_mlp": 0.01130373, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00086439, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.0654615669387164, + "language_loss": 0.68960202, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71212816, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.774606704711914 + }, + { + "auxiliary_loss_clip": 0.01121511, + "auxiliary_loss_mlp": 0.01129921, + "balance_loss_clip": 1.00209296, + "balance_loss_mlp": 1.0007931, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.83022747718644, + "language_loss": 0.70092463, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72343892, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.6469621658325195 + }, + { + "auxiliary_loss_clip": 0.01124744, + "auxiliary_loss_mlp": 0.01130243, + "balance_loss_clip": 1.00225818, + "balance_loss_mlp": 1.00073433, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 3.102012426484997, + "language_loss": 0.79246974, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81501967, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.676309108734131 + }, + { + "auxiliary_loss_clip": 0.01138188, + "auxiliary_loss_mlp": 0.01130173, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00075936, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 1.8125555591173386, + "language_loss": 0.68112254, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70380616, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.635227918624878 + }, + { + "auxiliary_loss_clip": 0.01124326, + "auxiliary_loss_mlp": 0.0112965, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00061762, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.6975051902201188, + "language_loss": 0.82620096, + "learning_rate": 2.879253987586635e-06, + "loss": 0.84874076, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.6468453407287598 + }, + { + "auxiliary_loss_clip": 0.01122557, + "auxiliary_loss_mlp": 0.01129731, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00079417, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.5405969914388817, + "language_loss": 0.74573952, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76826239, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.6358447074890137 + }, + { + "auxiliary_loss_clip": 0.01123282, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_clip": 1.00203311, + "balance_loss_mlp": 1.00077772, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 2.890459861320793, + "language_loss": 0.83427781, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.8568145, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.609685182571411 + }, + { + "auxiliary_loss_clip": 0.01154546, + "auxiliary_loss_mlp": 0.01130501, + "balance_loss_clip": 1.00203121, + "balance_loss_mlp": 1.00089717, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.810248232608943, + "language_loss": 0.73697859, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75982904, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 2.6527345180511475 + }, + { + "auxiliary_loss_clip": 0.01155192, + "auxiliary_loss_mlp": 0.01130476, + "balance_loss_clip": 1.00230956, + "balance_loss_mlp": 1.00096726, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.16565986850524, + "language_loss": 0.73457992, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75743657, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.5182533264160156 + }, + { + "auxiliary_loss_clip": 0.01139319, + "auxiliary_loss_mlp": 0.01129566, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00053418, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.8920341499120066, + "language_loss": 0.76996219, + "learning_rate": 2.877504536769561e-06, + "loss": 0.792651, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 2.638862371444702 + }, + { + "auxiliary_loss_clip": 0.011393, + "auxiliary_loss_mlp": 0.01129887, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00085449, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.7637815735742768, + "language_loss": 0.69505131, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71774316, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.5678343772888184 + }, + { + "auxiliary_loss_clip": 0.01155661, + "auxiliary_loss_mlp": 0.01130611, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00081611, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 2.4746728366305355, + "language_loss": 0.82344186, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84630454, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.5255720615386963 + }, + { + "auxiliary_loss_clip": 0.01171341, + "auxiliary_loss_mlp": 0.01130316, + "balance_loss_clip": 1.00211561, + "balance_loss_mlp": 1.00080705, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.9025404136680195, + "language_loss": 0.77838492, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.80140144, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.513472318649292 + }, + { + "auxiliary_loss_clip": 0.01156371, + "auxiliary_loss_mlp": 0.01130872, + "balance_loss_clip": 1.00202966, + "balance_loss_mlp": 1.00079048, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 1.9084060168148382, + "language_loss": 0.73179436, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75466675, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.584977626800537 + }, + { + "auxiliary_loss_clip": 0.01145787, + "auxiliary_loss_mlp": 0.0074832, + "balance_loss_clip": 1.00232935, + "balance_loss_mlp": 1.00128484, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 1.999458354939988, + "language_loss": 0.93292654, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95186758, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 4.084667205810547 + }, + { + "auxiliary_loss_clip": 0.01171387, + "auxiliary_loss_mlp": 0.01129925, + "balance_loss_clip": 1.00211763, + "balance_loss_mlp": 1.00079799, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 2.1645088853154766, + "language_loss": 0.70776832, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73078144, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.47413969039917 + }, + { + "auxiliary_loss_clip": 0.01076097, + "auxiliary_loss_mlp": 0.01130513, + "balance_loss_clip": 1.0016948, + "balance_loss_mlp": 1.0008136, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.830176342349202, + "language_loss": 0.65302378, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67508984, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 2.8667845726013184 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.00748224, + "balance_loss_clip": 1.00194657, + "balance_loss_mlp": 1.00118351, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.0263519672795596, + "language_loss": 0.76418114, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.78289229, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.5905990600585938 + }, + { + "auxiliary_loss_clip": 0.01122604, + "auxiliary_loss_mlp": 0.01130224, + "balance_loss_clip": 1.00187576, + "balance_loss_mlp": 1.00071454, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 1.950488056593979, + "language_loss": 0.83591896, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85844725, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.7268476486206055 + }, + { + "auxiliary_loss_clip": 0.01138145, + "auxiliary_loss_mlp": 0.01130558, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00095379, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.2453931408987233, + "language_loss": 0.68374205, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70642912, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.6679811477661133 + }, + { + "auxiliary_loss_clip": 0.01077882, + "auxiliary_loss_mlp": 0.00748355, + "balance_loss_clip": 1.00168478, + "balance_loss_mlp": 1.00116384, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7578650039350916, + "language_loss": 0.83816642, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85642874, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 4.146154403686523 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00080132, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 3.0534875996707935, + "language_loss": 0.83653355, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85890722, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.641803741455078 + }, + { + "auxiliary_loss_clip": 0.01141154, + "auxiliary_loss_mlp": 0.01130785, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00079954, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 2.6483862446073454, + "language_loss": 0.64163893, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66435838, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 2.5755789279937744 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01130805, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.00101042, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7343941156222964, + "language_loss": 0.74500871, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.76771683, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.6022157669067383 + }, + { + "auxiliary_loss_clip": 0.01154606, + "auxiliary_loss_mlp": 0.01130468, + "balance_loss_clip": 1.00205517, + "balance_loss_mlp": 1.00067353, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 4.612288441337181, + "language_loss": 0.54959202, + "learning_rate": 2.872251199697598e-06, + "loss": 0.57244277, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 3.9605274200439453 + }, + { + "auxiliary_loss_clip": 0.0115607, + "auxiliary_loss_mlp": 0.01130381, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.0009675, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 1.8737278679684666, + "language_loss": 0.84198862, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86485314, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.589614152908325 + }, + { + "auxiliary_loss_clip": 0.01138083, + "auxiliary_loss_mlp": 0.01130425, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00072551, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.5740790835847884, + "language_loss": 0.6841017, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70678669, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 4.266280174255371 + }, + { + "auxiliary_loss_clip": 0.01138899, + "auxiliary_loss_mlp": 0.01130109, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00088596, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.316339585138408, + "language_loss": 0.77296418, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79565424, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.589690923690796 + }, + { + "auxiliary_loss_clip": 0.01154833, + "auxiliary_loss_mlp": 0.01130826, + "balance_loss_clip": 1.00188792, + "balance_loss_mlp": 1.00084066, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 1.9993705307140552, + "language_loss": 0.58000696, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60286355, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.6463632583618164 + }, + { + "auxiliary_loss_clip": 0.01138034, + "auxiliary_loss_mlp": 0.01131125, + "balance_loss_clip": 1.00197911, + "balance_loss_mlp": 1.00094843, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 1.9101136095116789, + "language_loss": 0.89768076, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.92037237, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.5919551849365234 + }, + { + "auxiliary_loss_clip": 0.01120997, + "auxiliary_loss_mlp": 0.01129531, + "balance_loss_clip": 1.00194466, + "balance_loss_mlp": 1.00069022, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.7158822349296792, + "language_loss": 0.76525956, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78776485, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.700725555419922 + }, + { + "auxiliary_loss_clip": 0.01124184, + "auxiliary_loss_mlp": 0.01131684, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00103092, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 1.8038659084124826, + "language_loss": 0.61683011, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6393888, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.627185583114624 + }, + { + "auxiliary_loss_clip": 0.01154648, + "auxiliary_loss_mlp": 0.01130838, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00075674, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.606324649209623, + "language_loss": 0.73914015, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76199508, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.5620551109313965 + }, + { + "auxiliary_loss_clip": 0.01156195, + "auxiliary_loss_mlp": 0.01130447, + "balance_loss_clip": 1.0021553, + "balance_loss_mlp": 1.00074768, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.7322981384769351, + "language_loss": 0.70435536, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72722179, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.5547852516174316 + }, + { + "auxiliary_loss_clip": 0.01139598, + "auxiliary_loss_mlp": 0.0113041, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00071037, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 4.05692250393867, + "language_loss": 0.84700954, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86970973, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.568988800048828 + }, + { + "auxiliary_loss_clip": 0.01107769, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00106573, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.4794293497702808, + "language_loss": 0.80959499, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83197457, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.67997145652771 + }, + { + "auxiliary_loss_clip": 0.01107323, + "auxiliary_loss_mlp": 0.01131279, + "balance_loss_clip": 1.00176919, + "balance_loss_mlp": 1.00091171, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.9082764701803514, + "language_loss": 0.71292293, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73530895, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.6997973918914795 + }, + { + "auxiliary_loss_clip": 0.01139144, + "auxiliary_loss_mlp": 0.01130981, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00089979, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.5474021010956616, + "language_loss": 0.78212714, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80482835, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.586731433868408 + }, + { + "auxiliary_loss_clip": 0.01139426, + "auxiliary_loss_mlp": 0.01130866, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00097561, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.7069583660255097, + "language_loss": 0.79917741, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82188034, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.704927444458008 + }, + { + "auxiliary_loss_clip": 0.01156491, + "auxiliary_loss_mlp": 0.01130208, + "balance_loss_clip": 1.00219572, + "balance_loss_mlp": 1.00088966, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.7847511958388913, + "language_loss": 0.80619323, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82906026, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.6913716793060303 + }, + { + "auxiliary_loss_clip": 0.01171617, + "auxiliary_loss_mlp": 0.01130907, + "balance_loss_clip": 1.0023092, + "balance_loss_mlp": 1.00101662, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.123668981874002, + "language_loss": 0.79893374, + "learning_rate": 2.866639438447501e-06, + "loss": 0.8219589, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.4801368713378906 + }, + { + "auxiliary_loss_clip": 0.01171213, + "auxiliary_loss_mlp": 0.01130301, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00107813, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.548597346931804, + "language_loss": 0.73325944, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75627458, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.5200703144073486 + }, + { + "auxiliary_loss_clip": 0.01154643, + "auxiliary_loss_mlp": 0.0112981, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00106382, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.834321802348812, + "language_loss": 0.6906184, + "learning_rate": 2.865937375638654e-06, + "loss": 0.71346283, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.611001491546631 + }, + { + "auxiliary_loss_clip": 0.01154806, + "auxiliary_loss_mlp": 0.0113138, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00082207, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 3.0847399745356334, + "language_loss": 0.63096684, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65382874, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 2.584970235824585 + }, + { + "auxiliary_loss_clip": 0.01153548, + "auxiliary_loss_mlp": 0.01113134, + "balance_loss_clip": 1.00220585, + "balance_loss_mlp": 1.00021875, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7385218600409226, + "language_loss": 0.58890092, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.61156774, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.2548210620880127 + }, + { + "auxiliary_loss_clip": 0.01171369, + "auxiliary_loss_mlp": 0.0113044, + "balance_loss_clip": 1.00211203, + "balance_loss_mlp": 1.00093055, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4186865114316307, + "language_loss": 0.65154076, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67455888, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.672877788543701 + }, + { + "auxiliary_loss_clip": 0.01121178, + "auxiliary_loss_mlp": 0.01130164, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.0009414, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.7158074640684984, + "language_loss": 0.70696414, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72947752, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.6645936965942383 + }, + { + "auxiliary_loss_clip": 0.01168711, + "auxiliary_loss_mlp": 0.01113077, + "balance_loss_clip": 1.00229812, + "balance_loss_mlp": 1.00016236, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7030768705196413, + "language_loss": 0.56079435, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.5836122, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.0964465141296387 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01129863, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00083113, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 1.7027450780433968, + "language_loss": 0.80145305, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82429945, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 2.5611519813537598 + }, + { + "auxiliary_loss_clip": 0.0115581, + "auxiliary_loss_mlp": 0.0112966, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.0007236, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.6967518347941246, + "language_loss": 0.74196792, + "learning_rate": 2.863479122159103e-06, + "loss": 0.7648226, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.553372621536255 + }, + { + "auxiliary_loss_clip": 0.011549, + "auxiliary_loss_mlp": 0.01130426, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00101233, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4467283370053068, + "language_loss": 0.72083145, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74368465, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.5108606815338135 + }, + { + "auxiliary_loss_clip": 0.01139803, + "auxiliary_loss_mlp": 0.01130336, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00111282, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.6967261068606667, + "language_loss": 0.83970332, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.8624047, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 4.066116809844971 + }, + { + "auxiliary_loss_clip": 0.01106226, + "auxiliary_loss_mlp": 0.0112937, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00091052, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4998063367583385, + "language_loss": 0.75693047, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77928638, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.792433261871338 + }, + { + "auxiliary_loss_clip": 0.01140871, + "auxiliary_loss_mlp": 0.01130641, + "balance_loss_clip": 1.00210702, + "balance_loss_mlp": 1.00094151, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.8645463291271573, + "language_loss": 0.85448992, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87720501, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.5882279872894287 + }, + { + "auxiliary_loss_clip": 0.01155374, + "auxiliary_loss_mlp": 0.01129398, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00065231, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.5185000347438167, + "language_loss": 0.78263527, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80548292, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 2.5430943965911865 + }, + { + "auxiliary_loss_clip": 0.01139976, + "auxiliary_loss_mlp": 0.01131096, + "balance_loss_clip": 1.00232816, + "balance_loss_mlp": 1.00082374, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.6165068487660652, + "language_loss": 0.83100474, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85371548, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.6099448204040527 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01129628, + "balance_loss_clip": 1.00189102, + "balance_loss_mlp": 1.00078619, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 1.9028404855138605, + "language_loss": 0.7514298, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77411127, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.01130052, + "balance_loss_clip": 1.00206363, + "balance_loss_mlp": 1.00082898, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.404008682256379, + "language_loss": 0.7624501, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78546238, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.5121636390686035 + }, + { + "auxiliary_loss_clip": 0.01139651, + "auxiliary_loss_mlp": 0.01129759, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.00082266, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 5.408601543529408, + "language_loss": 0.84334463, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86603868, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 4.022059202194214 + }, + { + "auxiliary_loss_clip": 0.01155666, + "auxiliary_loss_mlp": 0.01129622, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00068498, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.7899269234744055, + "language_loss": 0.69380808, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.71666098, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 2.6482748985290527 + }, + { + "auxiliary_loss_clip": 0.010948, + "auxiliary_loss_mlp": 0.01130219, + "balance_loss_clip": 1.00204158, + "balance_loss_mlp": 1.00080585, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.6796526704497683, + "language_loss": 0.76640642, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78865659, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 2.727412700653076 + }, + { + "auxiliary_loss_clip": 0.01171388, + "auxiliary_loss_mlp": 0.01130557, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.00085676, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.190746994896129, + "language_loss": 0.86040407, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88342351, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.479907751083374 + }, + { + "auxiliary_loss_clip": 0.01137782, + "auxiliary_loss_mlp": 0.01129987, + "balance_loss_clip": 1.00203085, + "balance_loss_mlp": 1.00085962, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.7256857294614822, + "language_loss": 0.84262717, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86530483, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 4.0881187915802 + }, + { + "auxiliary_loss_clip": 0.01155115, + "auxiliary_loss_mlp": 0.01130886, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.0009954, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.006754617495508, + "language_loss": 0.82490629, + "learning_rate": 2.858557806518775e-06, + "loss": 0.84776628, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.527233600616455 + }, + { + "auxiliary_loss_clip": 0.01156162, + "auxiliary_loss_mlp": 0.0113018, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00067091, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.1662247487017567, + "language_loss": 0.73118627, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75404966, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 4.015877962112427 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01129538, + "balance_loss_clip": 1.00217879, + "balance_loss_mlp": 1.00069642, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.8917233817985866, + "language_loss": 0.7502889, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77313232, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.599461078643799 + }, + { + "auxiliary_loss_clip": 0.01154546, + "auxiliary_loss_mlp": 0.01129714, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00087261, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.9280231052863432, + "language_loss": 0.73524398, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75808656, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.5650224685668945 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.0113044, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00083518, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 1.98266210617542, + "language_loss": 0.79502869, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81755865, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 2.5982983112335205 + }, + { + "auxiliary_loss_clip": 0.01120634, + "auxiliary_loss_mlp": 0.01130441, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00055039, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.9937470027219721, + "language_loss": 0.76339704, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78590775, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.6170618534088135 + }, + { + "auxiliary_loss_clip": 0.01156349, + "auxiliary_loss_mlp": 0.01130612, + "balance_loss_clip": 1.00204694, + "balance_loss_mlp": 1.00110328, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.9049815331685032, + "language_loss": 0.69468164, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71755123, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.515357255935669 + }, + { + "auxiliary_loss_clip": 0.01171146, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00085306, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.9629438039204674, + "language_loss": 0.7148301, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73784137, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.522427797317505 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01130429, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00082445, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.102066635102349, + "language_loss": 0.82573992, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84844154, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.602548360824585 + }, + { + "auxiliary_loss_clip": 0.01156355, + "auxiliary_loss_mlp": 0.01130154, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00093126, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.9076191340233157, + "language_loss": 0.71726882, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.740134, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.5810976028442383 + }, + { + "auxiliary_loss_clip": 0.01171319, + "auxiliary_loss_mlp": 0.0112985, + "balance_loss_clip": 1.00215757, + "balance_loss_mlp": 1.00091374, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.8208411797157924, + "language_loss": 0.77149761, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79450923, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.4823927879333496 + }, + { + "auxiliary_loss_clip": 0.01139587, + "auxiliary_loss_mlp": 0.01130114, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00070035, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 1.8042215903012995, + "language_loss": 0.79187524, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81457227, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.5470831394195557 + }, + { + "auxiliary_loss_clip": 0.01109275, + "auxiliary_loss_mlp": 0.01130042, + "balance_loss_clip": 1.00196636, + "balance_loss_mlp": 1.00110483, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 2.4793076761174926, + "language_loss": 0.84099686, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86339003, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.655601978302002 + }, + { + "auxiliary_loss_clip": 0.01123988, + "auxiliary_loss_mlp": 0.01129221, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.00057054, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.027704251333523, + "language_loss": 0.76638412, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78891617, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.6055259704589844 + }, + { + "auxiliary_loss_clip": 0.01139576, + "auxiliary_loss_mlp": 0.01130933, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00075662, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.07162944945771, + "language_loss": 0.82631266, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84901786, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.5547821521759033 + }, + { + "auxiliary_loss_clip": 0.01154534, + "auxiliary_loss_mlp": 0.01130158, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00074482, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.891682312646913, + "language_loss": 0.67622757, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69907457, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.5823862552642822 + }, + { + "auxiliary_loss_clip": 0.01108714, + "auxiliary_loss_mlp": 0.01129351, + "balance_loss_clip": 1.00201213, + "balance_loss_mlp": 1.00089073, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 2.3546171226640853, + "language_loss": 0.68433791, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70671856, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.704580307006836 + }, + { + "auxiliary_loss_clip": 0.01171305, + "auxiliary_loss_mlp": 0.01129852, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00081956, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 2.174337079094172, + "language_loss": 0.77855784, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.80156946, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.5358285903930664 + }, + { + "auxiliary_loss_clip": 0.01171517, + "auxiliary_loss_mlp": 0.01131129, + "balance_loss_clip": 1.00221252, + "balance_loss_mlp": 1.00095248, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.380223780142805, + "language_loss": 0.80186582, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82489228, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.4927260875701904 + }, + { + "auxiliary_loss_clip": 0.01153139, + "auxiliary_loss_mlp": 0.01112344, + "balance_loss_clip": 1.00234854, + "balance_loss_mlp": 1.00019217, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9740589754450648, + "language_loss": 0.64536011, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66801488, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 3.032146692276001 + }, + { + "auxiliary_loss_clip": 0.01139165, + "auxiliary_loss_mlp": 0.01130435, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.0012126, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.7441473909866834, + "language_loss": 0.73528117, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75797713, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.672995090484619 + }, + { + "auxiliary_loss_clip": 0.01139624, + "auxiliary_loss_mlp": 0.01130692, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00089669, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.4197267949329946, + "language_loss": 0.78306496, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80576813, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.6095130443573 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01130037, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00081456, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.585045620128887, + "language_loss": 0.73164392, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75417435, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.6868274211883545 + }, + { + "auxiliary_loss_clip": 0.01108775, + "auxiliary_loss_mlp": 0.01130183, + "balance_loss_clip": 1.00210226, + "balance_loss_mlp": 1.00086462, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 2.2868911135672314, + "language_loss": 0.78752416, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80991375, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.6774990558624268 + }, + { + "auxiliary_loss_clip": 0.01155634, + "auxiliary_loss_mlp": 0.00748262, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00121474, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.703764482483733, + "language_loss": 0.76685113, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7858901, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.557537317276001 + }, + { + "auxiliary_loss_clip": 0.01144233, + "auxiliary_loss_mlp": 0.01129923, + "balance_loss_clip": 1.00237203, + "balance_loss_mlp": 1.00089085, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4708262966929184, + "language_loss": 0.70696795, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.72970951, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 2.5812315940856934 + }, + { + "auxiliary_loss_clip": 0.01119266, + "auxiliary_loss_mlp": 0.0111228, + "balance_loss_clip": 1.00234199, + "balance_loss_mlp": 1.00012827, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7766264616759623, + "language_loss": 0.56101108, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58332658, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 4.655456781387329 + }, + { + "auxiliary_loss_clip": 0.01122897, + "auxiliary_loss_mlp": 0.01129955, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.00082743, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.66554263153798, + "language_loss": 0.7142843, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73681283, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.7140984535217285 + }, + { + "auxiliary_loss_clip": 0.0115448, + "auxiliary_loss_mlp": 0.0112968, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00083804, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.7236409551139162, + "language_loss": 0.73309577, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75593734, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.53287935256958 + }, + { + "auxiliary_loss_clip": 0.01160595, + "auxiliary_loss_mlp": 0.01129904, + "balance_loss_clip": 1.00213993, + "balance_loss_mlp": 1.00087154, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0964836598977956, + "language_loss": 0.7146014, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73750639, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.6837074756622314 + }, + { + "auxiliary_loss_clip": 0.01121151, + "auxiliary_loss_mlp": 0.01129397, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00074673, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 2.011785046459073, + "language_loss": 0.65570182, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67820728, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.724590301513672 + }, + { + "auxiliary_loss_clip": 0.01154578, + "auxiliary_loss_mlp": 0.01129542, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00070119, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0623438670546888, + "language_loss": 0.84918463, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87202585, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.5372841358184814 + }, + { + "auxiliary_loss_clip": 0.01139219, + "auxiliary_loss_mlp": 0.01130781, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.0009855, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.0115505463552394, + "language_loss": 0.76345003, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.7861501, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.5582046508789062 + }, + { + "auxiliary_loss_clip": 0.01171304, + "auxiliary_loss_mlp": 0.01130178, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.00086021, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6321601150983642, + "language_loss": 0.6423825, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66539729, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 3.9609768390655518 + }, + { + "auxiliary_loss_clip": 0.01108153, + "auxiliary_loss_mlp": 0.01129557, + "balance_loss_clip": 1.00187552, + "balance_loss_mlp": 1.00090635, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.7038624530959443, + "language_loss": 0.71379471, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73617184, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.809704303741455 + }, + { + "auxiliary_loss_clip": 0.01124005, + "auxiliary_loss_mlp": 0.01129954, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00054061, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 2.0140784924760573, + "language_loss": 0.74593616, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76847577, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 2.6736252307891846 + }, + { + "auxiliary_loss_clip": 0.01156107, + "auxiliary_loss_mlp": 0.01129404, + "balance_loss_clip": 1.00206625, + "balance_loss_mlp": 1.00084829, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 1.985447705914539, + "language_loss": 0.85161966, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87447476, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 3.93609881401062 + }, + { + "auxiliary_loss_clip": 0.01140218, + "auxiliary_loss_mlp": 0.01130089, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00086629, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.4670184465570792, + "language_loss": 0.73104495, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75374806, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.6033992767333984 + }, + { + "auxiliary_loss_clip": 0.01122762, + "auxiliary_loss_mlp": 0.01129726, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00098002, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.8782756393549362, + "language_loss": 0.84447491, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86699986, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.6264522075653076 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_clip": 1.00210214, + "balance_loss_mlp": 1.00078225, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.8932638811149984, + "language_loss": 0.79431742, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81699258, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 3.9493227005004883 + }, + { + "auxiliary_loss_clip": 0.01154497, + "auxiliary_loss_mlp": 0.0112935, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00098526, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.766888205688568, + "language_loss": 0.72819149, + "learning_rate": 2.844461868547842e-06, + "loss": 0.75102997, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.6778030395507812 + }, + { + "auxiliary_loss_clip": 0.01171238, + "auxiliary_loss_mlp": 0.00748134, + "balance_loss_clip": 1.00214207, + "balance_loss_mlp": 1.00100231, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.4817552710619768, + "language_loss": 0.83102024, + "learning_rate": 2.844108810081459e-06, + "loss": 0.850214, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.5157582759857178 + }, + { + "auxiliary_loss_clip": 0.01154523, + "auxiliary_loss_mlp": 0.01129222, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00085711, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.2999040293633388, + "language_loss": 0.61261249, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63544995, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.5648081302642822 + }, + { + "auxiliary_loss_clip": 0.01145494, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.0009706, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.8731530406959866, + "language_loss": 0.55796695, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58072376, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 2.616652011871338 + }, + { + "auxiliary_loss_clip": 0.01121497, + "auxiliary_loss_mlp": 0.01129695, + "balance_loss_clip": 1.00221872, + "balance_loss_mlp": 1.00104403, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.8828970074322913, + "language_loss": 0.65886325, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68137515, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.743994951248169 + }, + { + "auxiliary_loss_clip": 0.0115497, + "auxiliary_loss_mlp": 0.01130707, + "balance_loss_clip": 1.00222659, + "balance_loss_mlp": 1.00119758, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.5788756294574107, + "language_loss": 0.75543547, + "learning_rate": 2.842696256262919e-06, + "loss": 0.7782923, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.5320470333099365 + }, + { + "auxiliary_loss_clip": 0.01095785, + "auxiliary_loss_mlp": 0.00748463, + "balance_loss_clip": 1.00206172, + "balance_loss_mlp": 1.00126839, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.0971776590848776, + "language_loss": 0.82020676, + "learning_rate": 2.842343037886987e-06, + "loss": 0.83864927, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.7844269275665283 + }, + { + "auxiliary_loss_clip": 0.01154347, + "auxiliary_loss_mlp": 0.01129212, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00065649, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.537756794677745, + "language_loss": 0.86231691, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88515258, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 3.150724411010742 + }, + { + "auxiliary_loss_clip": 0.01155698, + "auxiliary_loss_mlp": 0.01129687, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.0009408, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 1.752277192953886, + "language_loss": 0.79079044, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81364429, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.585681200027466 + }, + { + "auxiliary_loss_clip": 0.01154339, + "auxiliary_loss_mlp": 0.01129944, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00072145, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.9601130508430915, + "language_loss": 0.72905421, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75189704, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 2.5360288619995117 + }, + { + "auxiliary_loss_clip": 0.01154435, + "auxiliary_loss_mlp": 0.01129235, + "balance_loss_clip": 1.00201952, + "balance_loss_mlp": 1.00067949, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.878920724537718, + "language_loss": 0.69135571, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71419233, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.538801908493042 + }, + { + "auxiliary_loss_clip": 0.01141008, + "auxiliary_loss_mlp": 0.01129688, + "balance_loss_clip": 1.00208628, + "balance_loss_mlp": 1.00084662, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.7288017007658028, + "language_loss": 0.63788891, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.66059589, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.653782367706299 + }, + { + "auxiliary_loss_clip": 0.01141234, + "auxiliary_loss_mlp": 0.01129785, + "balance_loss_clip": 1.00206399, + "balance_loss_mlp": 1.00075245, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.9141384775649695, + "language_loss": 0.6941998, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71691, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.563807249069214 + }, + { + "auxiliary_loss_clip": 0.0113947, + "auxiliary_loss_mlp": 0.01130195, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00097251, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 1.9268514148850129, + "language_loss": 0.69121337, + "learning_rate": 2.839869615637177e-06, + "loss": 0.71390998, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.589414119720459 + }, + { + "auxiliary_loss_clip": 0.01120815, + "auxiliary_loss_mlp": 0.01129763, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00073051, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 1.7344473706965557, + "language_loss": 0.89420152, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91670728, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.599008321762085 + }, + { + "auxiliary_loss_clip": 0.01154592, + "auxiliary_loss_mlp": 0.01130067, + "balance_loss_clip": 1.0020442, + "balance_loss_mlp": 1.00093973, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.4798943550952215, + "language_loss": 0.74537224, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76821887, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.530552625656128 + }, + { + "auxiliary_loss_clip": 0.01137662, + "auxiliary_loss_mlp": 0.01129825, + "balance_loss_clip": 1.00188231, + "balance_loss_mlp": 1.00069737, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 2.1985826983462444, + "language_loss": 0.83417714, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85685199, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.592952013015747 + }, + { + "auxiliary_loss_clip": 0.01097029, + "auxiliary_loss_mlp": 0.01130067, + "balance_loss_clip": 1.00202489, + "balance_loss_mlp": 1.00093925, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.5354749326723947, + "language_loss": 0.77162915, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79390013, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.7044317722320557 + }, + { + "auxiliary_loss_clip": 0.01123247, + "auxiliary_loss_mlp": 0.01130267, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00075841, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.570072092629617, + "language_loss": 0.73114765, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75368279, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.641308069229126 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.00748332, + "balance_loss_clip": 1.00183809, + "balance_loss_mlp": 1.00124359, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.6889882769859095, + "language_loss": 0.69977766, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71848559, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.607628345489502 + }, + { + "auxiliary_loss_clip": 0.01156334, + "auxiliary_loss_mlp": 0.01129675, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00073767, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.65417551151337, + "language_loss": 0.75310349, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77596354, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.5502004623413086 + }, + { + "auxiliary_loss_clip": 0.0115446, + "auxiliary_loss_mlp": 0.01130083, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.00085998, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.6350500423214258, + "language_loss": 0.74299926, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.7658447, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.5771985054016113 + }, + { + "auxiliary_loss_clip": 0.0113798, + "auxiliary_loss_mlp": 0.01129918, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00069571, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 1.9722429672877215, + "language_loss": 0.87261868, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89529765, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.602926015853882 + }, + { + "auxiliary_loss_clip": 0.01154546, + "auxiliary_loss_mlp": 0.01130134, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.00091124, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 2.3760035072941923, + "language_loss": 0.76556551, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78841233, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.5658557415008545 + }, + { + "auxiliary_loss_clip": 0.01123791, + "auxiliary_loss_mlp": 0.0112978, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00065291, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.4272803994997596, + "language_loss": 0.76199055, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78452635, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 4.13640284538269 + }, + { + "auxiliary_loss_clip": 0.01155933, + "auxiliary_loss_mlp": 0.01130326, + "balance_loss_clip": 1.0020417, + "balance_loss_mlp": 1.00091267, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.8714765807075744, + "language_loss": 0.73915887, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76202148, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.711136817932129 + }, + { + "auxiliary_loss_clip": 0.01123391, + "auxiliary_loss_mlp": 0.01129157, + "balance_loss_clip": 1.00194609, + "balance_loss_mlp": 1.00069737, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.6125881098290944, + "language_loss": 0.64110661, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66363209, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.700892448425293 + }, + { + "auxiliary_loss_clip": 0.01171276, + "auxiliary_loss_mlp": 0.01129473, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00091779, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.738246060848782, + "language_loss": 0.83294976, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85595721, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.561903953552246 + }, + { + "auxiliary_loss_clip": 0.01171227, + "auxiliary_loss_mlp": 0.01129471, + "balance_loss_clip": 1.00213063, + "balance_loss_mlp": 1.0008204, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 2.1498501803205703, + "language_loss": 0.80745375, + "learning_rate": 2.834564176091943e-06, + "loss": 0.83046073, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.5589780807495117 + }, + { + "auxiliary_loss_clip": 0.01123988, + "auxiliary_loss_mlp": 0.01130132, + "balance_loss_clip": 1.00206602, + "balance_loss_mlp": 1.00090957, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.9834256221102946, + "language_loss": 0.75122315, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77376437, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.680633544921875 + }, + { + "auxiliary_loss_clip": 0.01155553, + "auxiliary_loss_mlp": 0.00748459, + "balance_loss_clip": 1.00213277, + "balance_loss_mlp": 1.00131845, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8287787680123524, + "language_loss": 0.81319112, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83223122, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 2.6248104572296143 + }, + { + "auxiliary_loss_clip": 0.01137434, + "auxiliary_loss_mlp": 0.01130817, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.0010215, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 1.5425202851420425, + "language_loss": 0.77537394, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79805642, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 4.054352283477783 + }, + { + "auxiliary_loss_clip": 0.0113895, + "auxiliary_loss_mlp": 0.01130414, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00100052, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.109791481136925, + "language_loss": 0.78619528, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80888891, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 2.5848441123962402 + }, + { + "auxiliary_loss_clip": 0.01091985, + "auxiliary_loss_mlp": 0.01130146, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00082803, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 1.5465658713412107, + "language_loss": 0.69408101, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71630234, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 3.030301570892334 + }, + { + "auxiliary_loss_clip": 0.01139584, + "auxiliary_loss_mlp": 0.01130716, + "balance_loss_clip": 1.00205541, + "balance_loss_mlp": 1.00082588, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.739039777866482, + "language_loss": 0.79049647, + "learning_rate": 2.83244000399261e-06, + "loss": 0.8131994, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 4.1383056640625 + }, + { + "auxiliary_loss_clip": 0.01139651, + "auxiliary_loss_mlp": 0.01129295, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00092995, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.420336397387113, + "language_loss": 0.65603137, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67872083, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.8123061656951904 + }, + { + "auxiliary_loss_clip": 0.01171139, + "auxiliary_loss_mlp": 0.01130098, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00068474, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 2.0917661869725093, + "language_loss": 0.81805122, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84106356, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.508718252182007 + }, + { + "auxiliary_loss_clip": 0.01105707, + "auxiliary_loss_mlp": 0.01130653, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.00114417, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.6231663439116681, + "language_loss": 0.58646607, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60882968, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 4.246844053268433 + }, + { + "auxiliary_loss_clip": 0.01140647, + "auxiliary_loss_mlp": 0.01129591, + "balance_loss_clip": 1.00208008, + "balance_loss_mlp": 1.00084531, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 1.9456303813937073, + "language_loss": 0.69048494, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71318734, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.619230270385742 + }, + { + "auxiliary_loss_clip": 0.01154952, + "auxiliary_loss_mlp": 0.0113028, + "balance_loss_clip": 1.00213087, + "balance_loss_mlp": 1.00077152, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 1.7386204585493452, + "language_loss": 0.73238546, + "learning_rate": 2.830668992382758e-06, + "loss": 0.7552377, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 2.547365188598633 + }, + { + "auxiliary_loss_clip": 0.01138896, + "auxiliary_loss_mlp": 0.01130315, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00080585, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.4181073322218833, + "language_loss": 0.68556762, + "learning_rate": 2.830314695509902e-06, + "loss": 0.7082597, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.6241981983184814 + }, + { + "auxiliary_loss_clip": 0.01155952, + "auxiliary_loss_mlp": 0.0112959, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00084364, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 1.8362188175425211, + "language_loss": 0.64259326, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66544867, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.575106382369995 + }, + { + "auxiliary_loss_clip": 0.01171087, + "auxiliary_loss_mlp": 0.01129947, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00101042, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.3016890645071464, + "language_loss": 0.6812014, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70421177, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.549443244934082 + }, + { + "auxiliary_loss_clip": 0.01107143, + "auxiliary_loss_mlp": 0.01130083, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00085974, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.680550479549858, + "language_loss": 0.78723443, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80960667, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.649496078491211 + }, + { + "auxiliary_loss_clip": 0.01155872, + "auxiliary_loss_mlp": 0.01130169, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00085032, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.409563969213117, + "language_loss": 0.64678532, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66964567, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.603757619857788 + }, + { + "auxiliary_loss_clip": 0.01123185, + "auxiliary_loss_mlp": 0.01130451, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.0007515, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.6760798989521246, + "language_loss": 0.72642863, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.74896502, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 2.646576166152954 + }, + { + "auxiliary_loss_clip": 0.01154301, + "auxiliary_loss_mlp": 0.01129849, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.0008173, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.5771580360666022, + "language_loss": 0.84840107, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87124264, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.5551767349243164 + }, + { + "auxiliary_loss_clip": 0.01107501, + "auxiliary_loss_mlp": 0.01130828, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00103331, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 1.975861525383909, + "language_loss": 0.75239956, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.7747829, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.7831225395202637 + }, + { + "auxiliary_loss_clip": 0.01160661, + "auxiliary_loss_mlp": 0.01130475, + "balance_loss_clip": 1.00226009, + "balance_loss_mlp": 1.00096631, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.054866225339301, + "language_loss": 0.76016021, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.78307152, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.5381319522857666 + }, + { + "auxiliary_loss_clip": 0.01153966, + "auxiliary_loss_mlp": 0.01129433, + "balance_loss_clip": 1.00189936, + "balance_loss_mlp": 1.00068736, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 4.509216661001915, + "language_loss": 0.72672284, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74955678, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.4985191822052 + }, + { + "auxiliary_loss_clip": 0.01154418, + "auxiliary_loss_mlp": 0.01130009, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.00078607, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.59902939295596, + "language_loss": 0.68138826, + "learning_rate": 2.826769997289796e-06, + "loss": 0.70423257, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.6390461921691895 + }, + { + "auxiliary_loss_clip": 0.01122908, + "auxiliary_loss_mlp": 0.01130336, + "balance_loss_clip": 1.0018729, + "balance_loss_mlp": 1.00082731, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 1.6761022803645333, + "language_loss": 0.7312668, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75379926, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.6187338829040527 + }, + { + "auxiliary_loss_clip": 0.01110876, + "auxiliary_loss_mlp": 0.01130248, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00093019, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.9241707895453029, + "language_loss": 0.68919158, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71160281, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.01154486, + "auxiliary_loss_mlp": 0.01129715, + "balance_loss_clip": 1.00209677, + "balance_loss_mlp": 1.00096917, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.757346263067814, + "language_loss": 0.83386475, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85670674, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.504293918609619 + }, + { + "auxiliary_loss_clip": 0.01171068, + "auxiliary_loss_mlp": 0.01129382, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00073171, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4128484061423838, + "language_loss": 0.8125695, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83557403, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.5071589946746826 + }, + { + "auxiliary_loss_clip": 0.01168544, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_clip": 1.00244212, + "balance_loss_mlp": 1.00026083, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.8004294651806034, + "language_loss": 0.6044656, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62727523, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.091383218765259 + }, + { + "auxiliary_loss_clip": 0.01171061, + "auxiliary_loss_mlp": 0.01129965, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00083733, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.210665801632817, + "language_loss": 0.67286122, + "learning_rate": 2.824641672639794e-06, + "loss": 0.69587147, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.534329414367676 + }, + { + "auxiliary_loss_clip": 0.01123899, + "auxiliary_loss_mlp": 0.01129535, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00088477, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 3.1990246089683003, + "language_loss": 0.74622762, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76876199, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.6692252159118652 + }, + { + "auxiliary_loss_clip": 0.01154312, + "auxiliary_loss_mlp": 0.01130154, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.00093126, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.3429425133896447, + "language_loss": 0.76315057, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78599524, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.532102346420288 + }, + { + "auxiliary_loss_clip": 0.01151641, + "auxiliary_loss_mlp": 0.01112226, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00007367, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9134977577592676, + "language_loss": 0.67036593, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69300461, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 2.9758734703063965 + }, + { + "auxiliary_loss_clip": 0.01123877, + "auxiliary_loss_mlp": 0.01129149, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00087965, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.5497827637237576, + "language_loss": 0.72557914, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74810934, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.6439645290374756 + }, + { + "auxiliary_loss_clip": 0.01170984, + "auxiliary_loss_mlp": 0.01129546, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.001086, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.4567620306952425, + "language_loss": 0.81259203, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83559734, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.6093807220458984 + }, + { + "auxiliary_loss_clip": 0.0113952, + "auxiliary_loss_mlp": 0.01128855, + "balance_loss_clip": 1.00220966, + "balance_loss_mlp": 1.00087237, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.8669142762800077, + "language_loss": 0.76548481, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78816855, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 4.25213623046875 + }, + { + "auxiliary_loss_clip": 0.01139317, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_clip": 1.00208652, + "balance_loss_mlp": 1.0009259, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.6753942627703835, + "language_loss": 0.7625798, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78527927, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.7019028663635254 + }, + { + "auxiliary_loss_clip": 0.01107187, + "auxiliary_loss_mlp": 0.01130722, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.0013082, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6358952844884365, + "language_loss": 0.700948, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72332704, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.8524460792541504 + }, + { + "auxiliary_loss_clip": 0.01154545, + "auxiliary_loss_mlp": 0.01129858, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00082588, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.7312906286912046, + "language_loss": 0.84075189, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.8635959, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.5277099609375 + }, + { + "auxiliary_loss_clip": 0.01155827, + "auxiliary_loss_mlp": 0.01129798, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.0007658, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 1.904841145041828, + "language_loss": 0.60997581, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63283205, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.4951982498168945 + }, + { + "auxiliary_loss_clip": 0.0112264, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.00187671, + "balance_loss_mlp": 1.00061393, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 2.0217326052533666, + "language_loss": 0.71616077, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73868835, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.664977788925171 + }, + { + "auxiliary_loss_clip": 0.01156, + "auxiliary_loss_mlp": 0.01130252, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00083804, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.6925335461282818, + "language_loss": 0.81281632, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83567882, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 3.909982204437256 + }, + { + "auxiliary_loss_clip": 0.01160671, + "auxiliary_loss_mlp": 0.01129885, + "balance_loss_clip": 1.00241256, + "balance_loss_mlp": 1.00094819, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 1.9791524552579858, + "language_loss": 0.71015322, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73305881, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.5342400074005127 + }, + { + "auxiliary_loss_clip": 0.01151167, + "auxiliary_loss_mlp": 0.01112311, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00015914, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8872595790978929, + "language_loss": 0.59684378, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61947858, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 3.2004799842834473 + }, + { + "auxiliary_loss_clip": 0.01170955, + "auxiliary_loss_mlp": 0.01130021, + "balance_loss_clip": 1.00210989, + "balance_loss_mlp": 1.00070238, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 2.9917707747337725, + "language_loss": 0.84610629, + "learning_rate": 2.819315942271794e-06, + "loss": 0.86911601, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 3.953737497329712 + }, + { + "auxiliary_loss_clip": 0.0117097, + "auxiliary_loss_mlp": 0.01129512, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.0007664, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.6830854074289794, + "language_loss": 0.79628098, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81928581, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.467775344848633 + }, + { + "auxiliary_loss_clip": 0.01171033, + "auxiliary_loss_mlp": 0.0074857, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00149369, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.9515296309935526, + "language_loss": 0.67356694, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69276297, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.5063045024871826 + }, + { + "auxiliary_loss_clip": 0.01139024, + "auxiliary_loss_mlp": 0.01130246, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00102293, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.7249003618938212, + "language_loss": 0.73078787, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75348055, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.666048526763916 + }, + { + "auxiliary_loss_clip": 0.01120881, + "auxiliary_loss_mlp": 0.01129107, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00093305, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.777009766812019, + "language_loss": 0.72400534, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74650514, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 4.079442977905273 + }, + { + "auxiliary_loss_clip": 0.01170822, + "auxiliary_loss_mlp": 0.01128715, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.00054169, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 2.0793520295056735, + "language_loss": 0.82538527, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84838068, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.466893196105957 + }, + { + "auxiliary_loss_clip": 0.0110709, + "auxiliary_loss_mlp": 0.01129323, + "balance_loss_clip": 1.00193846, + "balance_loss_mlp": 1.00086331, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 2.0425293274190874, + "language_loss": 0.83164966, + "learning_rate": 2.817183690261189e-06, + "loss": 0.85401374, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.653648853302002 + }, + { + "auxiliary_loss_clip": 0.01143894, + "auxiliary_loss_mlp": 0.01129386, + "balance_loss_clip": 1.00229859, + "balance_loss_mlp": 1.00092578, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.7141957491271949, + "language_loss": 0.69715679, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71988958, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 2.665219306945801 + }, + { + "auxiliary_loss_clip": 0.01138928, + "auxiliary_loss_mlp": 0.01129348, + "balance_loss_clip": 1.00193155, + "balance_loss_mlp": 1.00088859, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 2.1477966166364606, + "language_loss": 0.78991544, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81259823, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.552563190460205 + }, + { + "auxiliary_loss_clip": 0.01154498, + "auxiliary_loss_mlp": 0.01129651, + "balance_loss_clip": 1.00215125, + "balance_loss_mlp": 1.00080955, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.245407983821428, + "language_loss": 0.83708966, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.85993111, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.5075573921203613 + }, + { + "auxiliary_loss_clip": 0.01151707, + "auxiliary_loss_mlp": 0.01111388, + "balance_loss_clip": 1.00227714, + "balance_loss_mlp": 0.99999839, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8627806381648986, + "language_loss": 0.64954853, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67217946, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.1896560192108154 + }, + { + "auxiliary_loss_clip": 0.01137818, + "auxiliary_loss_mlp": 0.01129516, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00096035, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.3987518035493671, + "language_loss": 0.7374199, + "learning_rate": 2.8154059613008e-06, + "loss": 0.76009321, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 2.5684385299682617 + }, + { + "auxiliary_loss_clip": 0.01107102, + "auxiliary_loss_mlp": 0.01130658, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00114942, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 3.216460856257918, + "language_loss": 0.69933742, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72171503, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.6956112384796143 + }, + { + "auxiliary_loss_clip": 0.01103306, + "auxiliary_loss_mlp": 0.00746703, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 0.99996299, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6867708349970011, + "language_loss": 0.60504991, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62355, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 3.423442840576172 + }, + { + "auxiliary_loss_clip": 0.01108287, + "auxiliary_loss_mlp": 0.01128593, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00061011, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9600272549166542, + "language_loss": 0.77624452, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79861337, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.8659939765930176 + }, + { + "auxiliary_loss_clip": 0.0112347, + "auxiliary_loss_mlp": 0.01130117, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00079882, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.9688014550220625, + "language_loss": 0.7794292, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80196506, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.760999917984009 + }, + { + "auxiliary_loss_clip": 0.01168304, + "auxiliary_loss_mlp": 0.01111426, + "balance_loss_clip": 1.00232244, + "balance_loss_mlp": 1.00003648, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.798078192489029, + "language_loss": 0.61308765, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63588488, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.947477102279663 + }, + { + "auxiliary_loss_clip": 0.0112254, + "auxiliary_loss_mlp": 0.01129995, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.00096309, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 1.9303833447351337, + "language_loss": 0.78017807, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.80270338, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.6517114639282227 + }, + { + "auxiliary_loss_clip": 0.0113752, + "auxiliary_loss_mlp": 0.01128889, + "balance_loss_clip": 1.00205207, + "balance_loss_mlp": 1.00071549, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.5670983750167342, + "language_loss": 0.79520518, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81786925, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.6332595348358154 + }, + { + "auxiliary_loss_clip": 0.01160295, + "auxiliary_loss_mlp": 0.00748424, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00132465, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.628156068638138, + "language_loss": 0.78955835, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.80864555, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.531153678894043 + }, + { + "auxiliary_loss_clip": 0.01137605, + "auxiliary_loss_mlp": 0.01129215, + "balance_loss_clip": 1.0017494, + "balance_loss_mlp": 1.00085056, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 1.669121399317418, + "language_loss": 0.80129087, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82395911, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.5482900142669678 + }, + { + "auxiliary_loss_clip": 0.01138819, + "auxiliary_loss_mlp": 0.01128947, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.00077331, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.9530610203081116, + "language_loss": 0.79413086, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81680852, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.608891010284424 + }, + { + "auxiliary_loss_clip": 0.01138179, + "auxiliary_loss_mlp": 0.01129473, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00082183, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 1.9842634256904148, + "language_loss": 0.67594945, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69862598, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 2.591840982437134 + }, + { + "auxiliary_loss_clip": 0.01122106, + "auxiliary_loss_mlp": 0.01129477, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00101721, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.1754604283917134, + "language_loss": 0.81185341, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83436924, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 2.5874438285827637 + }, + { + "auxiliary_loss_clip": 0.01137339, + "auxiliary_loss_mlp": 0.01129203, + "balance_loss_clip": 1.00180185, + "balance_loss_mlp": 1.00083804, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.1052864627496293, + "language_loss": 0.72349632, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74616176, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.5527570247650146 + }, + { + "auxiliary_loss_clip": 0.0113695, + "auxiliary_loss_mlp": 0.01128922, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.0011301, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6415071694072991, + "language_loss": 0.66863114, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.69128984, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.547233819961548 + }, + { + "auxiliary_loss_clip": 0.01155122, + "auxiliary_loss_mlp": 0.0112923, + "balance_loss_clip": 1.00205195, + "balance_loss_mlp": 1.00077045, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.8862936251021634, + "language_loss": 0.68774533, + "learning_rate": 2.810068143123449e-06, + "loss": 0.71058881, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.6929686069488525 + }, + { + "auxiliary_loss_clip": 0.01121994, + "auxiliary_loss_mlp": 0.01128668, + "balance_loss_clip": 1.00200427, + "balance_loss_mlp": 1.00087523, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.5303900587984947, + "language_loss": 0.72524369, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74775034, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.62328839302063 + }, + { + "auxiliary_loss_clip": 0.01123656, + "auxiliary_loss_mlp": 0.00748654, + "balance_loss_clip": 1.00179303, + "balance_loss_mlp": 1.00162721, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.007313902070209, + "language_loss": 0.80471629, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82343936, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.662163734436035 + }, + { + "auxiliary_loss_clip": 0.01155399, + "auxiliary_loss_mlp": 0.01129141, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00077605, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 1.8288313542223082, + "language_loss": 0.74769819, + "learning_rate": 2.80899974864781e-06, + "loss": 0.77054358, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 4.088754892349243 + }, + { + "auxiliary_loss_clip": 0.0110858, + "auxiliary_loss_mlp": 0.01128896, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00091326, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 2.166173650706351, + "language_loss": 0.70218432, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72455907, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.6485180854797363 + }, + { + "auxiliary_loss_clip": 0.01138316, + "auxiliary_loss_mlp": 0.01129052, + "balance_loss_clip": 1.00192559, + "balance_loss_mlp": 1.00116444, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.2339426297878897, + "language_loss": 0.84334099, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86601472, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.546402931213379 + }, + { + "auxiliary_loss_clip": 0.01140746, + "auxiliary_loss_mlp": 0.01129744, + "balance_loss_clip": 1.00200593, + "balance_loss_mlp": 1.00099826, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.423891467415185, + "language_loss": 0.81311136, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83581626, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.5848729610443115 + }, + { + "auxiliary_loss_clip": 0.01119827, + "auxiliary_loss_mlp": 0.01111374, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 0.99998492, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7103694376231119, + "language_loss": 0.58816123, + "learning_rate": 2.807574793260416e-06, + "loss": 0.61047322, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.2490296363830566 + }, + { + "auxiliary_loss_clip": 0.01093355, + "auxiliary_loss_mlp": 0.01129115, + "balance_loss_clip": 1.00178492, + "balance_loss_mlp": 1.00084591, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.7230266396311849, + "language_loss": 0.79304719, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81527197, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 4.065282344818115 + }, + { + "auxiliary_loss_clip": 0.01155984, + "auxiliary_loss_mlp": 0.01130011, + "balance_loss_clip": 1.00199902, + "balance_loss_mlp": 1.0009793, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 2.406351500240236, + "language_loss": 0.80687296, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82973289, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.5428497791290283 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01129708, + "balance_loss_clip": 1.00206435, + "balance_loss_mlp": 1.0007714, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.7193182731943204, + "language_loss": 0.70588553, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72855759, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.601893663406372 + }, + { + "auxiliary_loss_clip": 0.01123193, + "auxiliary_loss_mlp": 0.01130442, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.0008378, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 2.346715925951874, + "language_loss": 0.77475405, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79729038, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.587893486022949 + }, + { + "auxiliary_loss_clip": 0.01154183, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00072038, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.7144415359424776, + "language_loss": 0.79711258, + "learning_rate": 2.805792910102915e-06, + "loss": 0.8199414, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.5258848667144775 + }, + { + "auxiliary_loss_clip": 0.01137378, + "auxiliary_loss_mlp": 0.01127963, + "balance_loss_clip": 1.0020963, + "balance_loss_mlp": 1.00093353, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.646573763041785, + "language_loss": 0.76808292, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79073632, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 4.366809606552124 + }, + { + "auxiliary_loss_clip": 0.01137435, + "auxiliary_loss_mlp": 0.01127979, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00085437, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 1.882745149372722, + "language_loss": 0.81912732, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84178138, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.5430727005004883 + }, + { + "auxiliary_loss_clip": 0.01140383, + "auxiliary_loss_mlp": 0.00748396, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.00153017, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.386105182966527, + "language_loss": 0.7558254, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77471316, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 4.091139316558838 + }, + { + "auxiliary_loss_clip": 0.01170779, + "auxiliary_loss_mlp": 0.01128533, + "balance_loss_clip": 1.00215089, + "balance_loss_mlp": 1.00074077, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.37893678077352, + "language_loss": 0.73866904, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76166213, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.513784646987915 + }, + { + "auxiliary_loss_clip": 0.01154243, + "auxiliary_loss_mlp": 0.01129373, + "balance_loss_clip": 1.00201154, + "balance_loss_mlp": 1.00100827, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 2.0123373201546704, + "language_loss": 0.82109469, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84393084, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.525853157043457 + }, + { + "auxiliary_loss_clip": 0.01170993, + "auxiliary_loss_mlp": 0.01129348, + "balance_loss_clip": 1.00214434, + "balance_loss_mlp": 1.00117397, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.1552411492581656, + "language_loss": 0.81171906, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8347224, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.459839105606079 + }, + { + "auxiliary_loss_clip": 0.01122503, + "auxiliary_loss_mlp": 0.0112902, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00075114, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.6326813354561724, + "language_loss": 0.83683169, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85934693, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.58435320854187 + }, + { + "auxiliary_loss_clip": 0.01134321, + "auxiliary_loss_mlp": 0.01110838, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00021207, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7613667908675135, + "language_loss": 0.5018146, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52426624, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 3.1563453674316406 + }, + { + "auxiliary_loss_clip": 0.01123853, + "auxiliary_loss_mlp": 0.0074834, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00135469, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.4366127867033995, + "language_loss": 0.78806132, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80678326, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.6393322944641113 + }, + { + "auxiliary_loss_clip": 0.01155808, + "auxiliary_loss_mlp": 0.0112867, + "balance_loss_clip": 1.0020684, + "balance_loss_mlp": 1.00087738, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.8103443754632196, + "language_loss": 0.81140816, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83425289, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.507648468017578 + }, + { + "auxiliary_loss_clip": 0.01137152, + "auxiliary_loss_mlp": 0.0112888, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00089729, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.702110612320238, + "language_loss": 0.7733134, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79597372, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.590301752090454 + }, + { + "auxiliary_loss_clip": 0.01138927, + "auxiliary_loss_mlp": 0.01128366, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00086021, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.620492564008382, + "language_loss": 0.75935364, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78202665, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 2.5654373168945312 + }, + { + "auxiliary_loss_clip": 0.0113907, + "auxiliary_loss_mlp": 0.01129467, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.00072098, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.4992675529263535, + "language_loss": 0.75895405, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78163934, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.567246913909912 + }, + { + "auxiliary_loss_clip": 0.01122176, + "auxiliary_loss_mlp": 0.00748329, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00123477, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.557401893465927, + "language_loss": 0.78513157, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80383658, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.630668878555298 + }, + { + "auxiliary_loss_clip": 0.01170859, + "auxiliary_loss_mlp": 0.01129744, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00099778, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.139010359545689, + "language_loss": 0.77690649, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79991251, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.562713861465454 + }, + { + "auxiliary_loss_clip": 0.0117065, + "auxiliary_loss_mlp": 0.01127867, + "balance_loss_clip": 1.00204003, + "balance_loss_mlp": 1.00055158, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.7256285339227053, + "language_loss": 0.76524115, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78822637, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.5130019187927246 + }, + { + "auxiliary_loss_clip": 0.01139369, + "auxiliary_loss_mlp": 0.0112909, + "balance_loss_clip": 1.00205553, + "balance_loss_mlp": 1.00101173, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.455441022806742, + "language_loss": 0.79768789, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82037252, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.622197389602661 + }, + { + "auxiliary_loss_clip": 0.01155875, + "auxiliary_loss_mlp": 0.01129159, + "balance_loss_clip": 1.00208759, + "balance_loss_mlp": 1.00089002, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 3.2447137420916667, + "language_loss": 0.71664017, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73949051, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.55244779586792 + }, + { + "auxiliary_loss_clip": 0.01171012, + "auxiliary_loss_mlp": 0.01129575, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00092423, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 1.5071979119524366, + "language_loss": 0.77620447, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.79921031, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.494821071624756 + }, + { + "auxiliary_loss_clip": 0.01170826, + "auxiliary_loss_mlp": 0.01128333, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00073087, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.560980055646427, + "language_loss": 0.75519639, + "learning_rate": 2.798657755439662e-06, + "loss": 0.77818799, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.4963457584381104 + }, + { + "auxiliary_loss_clip": 0.01089223, + "auxiliary_loss_mlp": 0.01129342, + "balance_loss_clip": 1.00165391, + "balance_loss_mlp": 1.00069106, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.2770963515413585, + "language_loss": 0.6045388, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62672448, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.6921961307525635 + }, + { + "auxiliary_loss_clip": 0.01170735, + "auxiliary_loss_mlp": 0.01129387, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00083148, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.0874879378292226, + "language_loss": 0.79712105, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82012224, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.4871175289154053 + }, + { + "auxiliary_loss_clip": 0.01107671, + "auxiliary_loss_mlp": 0.01128838, + "balance_loss_clip": 1.00169599, + "balance_loss_mlp": 1.00075948, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8917359325690253, + "language_loss": 0.81418872, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83655387, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.6908621788024902 + }, + { + "auxiliary_loss_clip": 0.01139021, + "auxiliary_loss_mlp": 0.01128445, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00065255, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 1.9970381506904769, + "language_loss": 0.61812067, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.64079535, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.5620572566986084 + }, + { + "auxiliary_loss_clip": 0.01154258, + "auxiliary_loss_mlp": 0.01128264, + "balance_loss_clip": 1.00211835, + "balance_loss_mlp": 1.00075758, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.643113877074254, + "language_loss": 0.86373222, + "learning_rate": 2.796872069720717e-06, + "loss": 0.8865574, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.567708730697632 + }, + { + "auxiliary_loss_clip": 0.01154081, + "auxiliary_loss_mlp": 0.01128992, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00081766, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 2.6759582602978678, + "language_loss": 0.71374536, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73657608, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.595845937728882 + }, + { + "auxiliary_loss_clip": 0.01125232, + "auxiliary_loss_mlp": 0.01128924, + "balance_loss_clip": 1.00199282, + "balance_loss_mlp": 1.00074983, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 1.8987577436848468, + "language_loss": 0.7611351, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78367662, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.638115406036377 + }, + { + "auxiliary_loss_clip": 0.01120812, + "auxiliary_loss_mlp": 0.01129001, + "balance_loss_clip": 1.00182009, + "balance_loss_mlp": 1.00063634, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 2.0250065910669286, + "language_loss": 0.70781314, + "learning_rate": 2.795800295571382e-06, + "loss": 0.73031127, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 2.6072118282318115 + }, + { + "auxiliary_loss_clip": 0.01137136, + "auxiliary_loss_mlp": 0.01128176, + "balance_loss_clip": 1.00178814, + "balance_loss_mlp": 1.00067008, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 2.1170728652000546, + "language_loss": 0.69880497, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.72145808, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 4.079679012298584 + }, + { + "auxiliary_loss_clip": 0.01122562, + "auxiliary_loss_mlp": 0.01128617, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00082469, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.0045979810637555, + "language_loss": 0.77998662, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.8024984, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.6575169563293457 + }, + { + "auxiliary_loss_clip": 0.01124077, + "auxiliary_loss_mlp": 0.0112856, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00067222, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.5299907786375464, + "language_loss": 0.69671166, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71923804, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.6647162437438965 + }, + { + "auxiliary_loss_clip": 0.01122274, + "auxiliary_loss_mlp": 0.0112835, + "balance_loss_clip": 1.00168407, + "balance_loss_mlp": 1.00065315, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.453749551697675, + "language_loss": 0.83152843, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85403466, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.620774745941162 + }, + { + "auxiliary_loss_clip": 0.01143544, + "auxiliary_loss_mlp": 0.01127636, + "balance_loss_clip": 1.00218558, + "balance_loss_mlp": 1.00070214, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 2.109791068024028, + "language_loss": 0.84178084, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.8644926, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.5724475383758545 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01128267, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00057006, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.5386640972224028, + "language_loss": 0.74915177, + "learning_rate": 2.793655932864273e-06, + "loss": 0.77165443, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 4.012285470962524 + }, + { + "auxiliary_loss_clip": 0.01120607, + "auxiliary_loss_mlp": 0.00748396, + "balance_loss_clip": 1.00162876, + "balance_loss_mlp": 1.00129151, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.52861305684468, + "language_loss": 0.74851906, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.76720911, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.7145891189575195 + }, + { + "auxiliary_loss_clip": 0.01108192, + "auxiliary_loss_mlp": 0.01128782, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00089478, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.5469264577711723, + "language_loss": 0.6778537, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70022345, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.6614792346954346 + }, + { + "auxiliary_loss_clip": 0.01123169, + "auxiliary_loss_mlp": 0.01128205, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00079441, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 2.2943827323699986, + "language_loss": 0.76261324, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78512704, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.6555840969085693 + }, + { + "auxiliary_loss_clip": 0.01137022, + "auxiliary_loss_mlp": 0.01129005, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00111771, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.8798822878315258, + "language_loss": 0.70616972, + "learning_rate": 2.792225755635257e-06, + "loss": 0.72882998, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 4.074125051498413 + }, + { + "auxiliary_loss_clip": 0.01170801, + "auxiliary_loss_mlp": 0.01128193, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00078201, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.6263483863345614, + "language_loss": 0.68828642, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71127635, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.5053036212921143 + }, + { + "auxiliary_loss_clip": 0.0114021, + "auxiliary_loss_mlp": 0.01129129, + "balance_loss_clip": 1.00211132, + "balance_loss_mlp": 1.00085962, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.8548834488902886, + "language_loss": 0.75783551, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78052884, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 2.5935423374176025 + }, + { + "auxiliary_loss_clip": 0.01138371, + "auxiliary_loss_mlp": 0.01110834, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.00020826, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.8434210252482114, + "language_loss": 0.58212578, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60461783, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 4.622219800949097 + }, + { + "auxiliary_loss_clip": 0.01105088, + "auxiliary_loss_mlp": 0.01128573, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00068533, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 2.5082843744658616, + "language_loss": 0.78002548, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80236208, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.6795802116394043 + }, + { + "auxiliary_loss_clip": 0.01155479, + "auxiliary_loss_mlp": 0.01128343, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00064564, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 1.8514810022213253, + "language_loss": 0.82398713, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8468253, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.5112788677215576 + }, + { + "auxiliary_loss_clip": 0.01170791, + "auxiliary_loss_mlp": 0.01128048, + "balance_loss_clip": 1.00213468, + "balance_loss_mlp": 1.00073206, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7802927156656208, + "language_loss": 0.79907358, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82206202, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.5031421184539795 + }, + { + "auxiliary_loss_clip": 0.01138892, + "auxiliary_loss_mlp": 0.01127705, + "balance_loss_clip": 1.00193059, + "balance_loss_mlp": 1.00048459, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.5367521551099923, + "language_loss": 0.83037794, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85304391, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.5628645420074463 + }, + { + "auxiliary_loss_clip": 0.01140792, + "auxiliary_loss_mlp": 0.01127474, + "balance_loss_clip": 1.00213194, + "balance_loss_mlp": 1.00073075, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.683498954874275, + "language_loss": 0.75490063, + "learning_rate": 2.789363960063863e-06, + "loss": 0.7775833, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.555739164352417 + }, + { + "auxiliary_loss_clip": 0.01121917, + "auxiliary_loss_mlp": 0.01128101, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00078583, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 2.148085055924083, + "language_loss": 0.79172289, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81422311, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.6229333877563477 + }, + { + "auxiliary_loss_clip": 0.01154046, + "auxiliary_loss_mlp": 0.0112796, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00064445, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.44487289346122, + "language_loss": 0.79925597, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82207602, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.5542728900909424 + }, + { + "auxiliary_loss_clip": 0.01155616, + "auxiliary_loss_mlp": 0.0112886, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00087702, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.9507111723543014, + "language_loss": 0.78200054, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80484527, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.5441834926605225 + }, + { + "auxiliary_loss_clip": 0.01107909, + "auxiliary_loss_mlp": 0.01127875, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00065446, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.218076636098696, + "language_loss": 0.85560501, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87796289, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.681058168411255 + }, + { + "auxiliary_loss_clip": 0.01143509, + "auxiliary_loss_mlp": 0.01128129, + "balance_loss_clip": 1.00204754, + "balance_loss_mlp": 1.00071812, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 2.5155181768076154, + "language_loss": 0.85249901, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87521535, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 2.651317834854126 + }, + { + "auxiliary_loss_clip": 0.01138757, + "auxiliary_loss_mlp": 0.01127374, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00053525, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.458425089137572, + "language_loss": 0.72972089, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75238222, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.589755058288574 + }, + { + "auxiliary_loss_clip": 0.01121852, + "auxiliary_loss_mlp": 0.01128861, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00078261, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 1.5699557218425586, + "language_loss": 0.68728369, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70979083, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.640005111694336 + }, + { + "auxiliary_loss_clip": 0.01137675, + "auxiliary_loss_mlp": 0.01127882, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00085294, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.701004676727828, + "language_loss": 0.80737865, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83003426, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.615738868713379 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01128181, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00076962, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 2.4596310215643493, + "language_loss": 0.89628166, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91911972, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.521329164505005 + }, + { + "auxiliary_loss_clip": 0.01121888, + "auxiliary_loss_mlp": 0.01128203, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.0006969, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 3.372876121252305, + "language_loss": 0.78531122, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80781215, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.6445629596710205 + }, + { + "auxiliary_loss_clip": 0.01139981, + "auxiliary_loss_mlp": 0.01127765, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00083089, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.795028548831594, + "language_loss": 0.74398434, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76666176, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.671499013900757 + }, + { + "auxiliary_loss_clip": 0.01111709, + "auxiliary_loss_mlp": 0.0112887, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00069666, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.875026536226635, + "language_loss": 0.75562453, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.77803034, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.6175484657287598 + }, + { + "auxiliary_loss_clip": 0.01154153, + "auxiliary_loss_mlp": 0.01129475, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00101495, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 1.8327082736070914, + "language_loss": 0.74612856, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76896477, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.5491161346435547 + }, + { + "auxiliary_loss_clip": 0.01170699, + "auxiliary_loss_mlp": 0.01128667, + "balance_loss_clip": 1.00205207, + "balance_loss_mlp": 1.00097001, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.6142733299408707, + "language_loss": 0.68276417, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70575786, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.5573573112487793 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01110773, + "balance_loss_clip": 1.00214291, + "balance_loss_mlp": 1.00014687, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6630257745182632, + "language_loss": 0.53947008, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56177688, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.2801904678344727 + }, + { + "auxiliary_loss_clip": 0.01121893, + "auxiliary_loss_mlp": 0.00748377, + "balance_loss_clip": 1.00206673, + "balance_loss_mlp": 1.00140166, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 3.160842291115516, + "language_loss": 0.69178581, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.7104885, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.615994691848755 + }, + { + "auxiliary_loss_clip": 0.01121275, + "auxiliary_loss_mlp": 0.01110122, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.00025904, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7270969280278798, + "language_loss": 0.51725227, + "learning_rate": 2.783276292417936e-06, + "loss": 0.53956628, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.2460055351257324 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.01128723, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00083494, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.7344974452511088, + "language_loss": 0.73585749, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75868666, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.5823707580566406 + }, + { + "auxiliary_loss_clip": 0.01154316, + "auxiliary_loss_mlp": 0.01128824, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00084054, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 1.7938766217969635, + "language_loss": 0.69245017, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.71528161, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.572751045227051 + }, + { + "auxiliary_loss_clip": 0.01153827, + "auxiliary_loss_mlp": 0.01128228, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00081718, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6730174227904384, + "language_loss": 0.78829795, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81111848, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 3.995332717895508 + }, + { + "auxiliary_loss_clip": 0.01137347, + "auxiliary_loss_mlp": 0.0112739, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00083756, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.343744944570619, + "language_loss": 0.80294704, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82559448, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.619272470474243 + }, + { + "auxiliary_loss_clip": 0.01137424, + "auxiliary_loss_mlp": 0.01127589, + "balance_loss_clip": 1.00198913, + "balance_loss_mlp": 1.00055933, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.6773699110224911, + "language_loss": 0.7112087, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73385882, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.554668664932251 + }, + { + "auxiliary_loss_clip": 0.01170664, + "auxiliary_loss_mlp": 0.01127551, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00071287, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.6585837527114853, + "language_loss": 0.82915473, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85213685, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 2.538118839263916 + }, + { + "auxiliary_loss_clip": 0.01170762, + "auxiliary_loss_mlp": 0.01128104, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.0006932, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 2.018101569040669, + "language_loss": 0.71665907, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73964775, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.542593002319336 + }, + { + "auxiliary_loss_clip": 0.01138068, + "auxiliary_loss_mlp": 0.01127735, + "balance_loss_clip": 1.00197256, + "balance_loss_mlp": 1.00080049, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.3556543292113674, + "language_loss": 0.75378436, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77644235, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 2.588768720626831 + }, + { + "auxiliary_loss_clip": 0.0116791, + "auxiliary_loss_mlp": 0.01109946, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00008321, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7570375766868406, + "language_loss": 0.56527507, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58805358, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 4.594398260116577 + }, + { + "auxiliary_loss_clip": 0.01154184, + "auxiliary_loss_mlp": 0.01127501, + "balance_loss_clip": 1.0020771, + "balance_loss_mlp": 1.00085354, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.726895193788317, + "language_loss": 0.76046383, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78328073, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.531475305557251 + }, + { + "auxiliary_loss_clip": 0.01124807, + "auxiliary_loss_mlp": 0.01128373, + "balance_loss_clip": 1.0018152, + "balance_loss_mlp": 1.00077152, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.4689228821765594, + "language_loss": 0.82909882, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85163063, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.6032042503356934 + }, + { + "auxiliary_loss_clip": 0.01155289, + "auxiliary_loss_mlp": 0.01127961, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00064528, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 2.8425218652857303, + "language_loss": 0.76843256, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.79126501, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.5198090076446533 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.01109953, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00008988, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7302177791659663, + "language_loss": 0.57758534, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.60004854, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 4.646712779998779 + }, + { + "auxiliary_loss_clip": 0.01170877, + "auxiliary_loss_mlp": 0.01128423, + "balance_loss_clip": 1.00212801, + "balance_loss_mlp": 1.00063062, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.5803573238033073, + "language_loss": 0.69469655, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71768951, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.5400850772857666 + }, + { + "auxiliary_loss_clip": 0.01108171, + "auxiliary_loss_mlp": 0.01129438, + "balance_loss_clip": 1.00206399, + "balance_loss_mlp": 1.00069189, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.1881037113276958, + "language_loss": 0.75438964, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7767657, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 4.106561899185181 + }, + { + "auxiliary_loss_clip": 0.01128758, + "auxiliary_loss_mlp": 0.01128744, + "balance_loss_clip": 1.00229859, + "balance_loss_mlp": 1.00085592, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.9214502136881777, + "language_loss": 0.77380687, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79638183, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.581202507019043 + }, + { + "auxiliary_loss_clip": 0.01125064, + "auxiliary_loss_mlp": 0.01128085, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.00105524, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4233663365073228, + "language_loss": 0.80006748, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82259893, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.6682639122009277 + }, + { + "auxiliary_loss_clip": 0.0110587, + "auxiliary_loss_mlp": 0.01127536, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00079226, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.8985698871539383, + "language_loss": 0.70759135, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72992539, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.7255911827087402 + }, + { + "auxiliary_loss_clip": 0.01123884, + "auxiliary_loss_mlp": 0.01128203, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.0006969, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.6518502457118749, + "language_loss": 0.71724516, + "learning_rate": 2.776462273631956e-06, + "loss": 0.739766, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.7734570503234863 + }, + { + "auxiliary_loss_clip": 0.01154123, + "auxiliary_loss_mlp": 0.01128411, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00090516, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.6249249911004646, + "language_loss": 0.6157093, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63853467, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.6836767196655273 + }, + { + "auxiliary_loss_clip": 0.01171029, + "auxiliary_loss_mlp": 0.01129412, + "balance_loss_clip": 1.0022254, + "balance_loss_mlp": 1.00076151, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 6.562664804843587, + "language_loss": 0.6717068, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69471121, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.50353741645813 + }, + { + "auxiliary_loss_clip": 0.01170759, + "auxiliary_loss_mlp": 0.01127721, + "balance_loss_clip": 1.0020951, + "balance_loss_mlp": 1.00069165, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 2.316800822704668, + "language_loss": 0.78902173, + "learning_rate": 2.775385401898104e-06, + "loss": 0.81200659, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.474623441696167 + }, + { + "auxiliary_loss_clip": 0.01153994, + "auxiliary_loss_mlp": 0.01129126, + "balance_loss_clip": 1.00197887, + "balance_loss_mlp": 1.00076127, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.075130705417047, + "language_loss": 0.70495963, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72779083, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.5315682888031006 + }, + { + "auxiliary_loss_clip": 0.01139104, + "auxiliary_loss_mlp": 0.01128381, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.0006839, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.788487628581299, + "language_loss": 0.76939094, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79206574, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 2.5594472885131836 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01128182, + "balance_loss_clip": 1.00192821, + "balance_loss_mlp": 1.00086689, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.4036451565408625, + "language_loss": 0.62438506, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64705485, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.675011396408081 + }, + { + "auxiliary_loss_clip": 0.0117073, + "auxiliary_loss_mlp": 0.01128002, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00078177, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6221660661053674, + "language_loss": 0.74033189, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76331919, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.5453224182128906 + }, + { + "auxiliary_loss_clip": 0.01137407, + "auxiliary_loss_mlp": 0.01127968, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.0009383, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 1.8836338347579558, + "language_loss": 0.81569707, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83835077, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 2.5526747703552246 + }, + { + "auxiliary_loss_clip": 0.01154025, + "auxiliary_loss_mlp": 0.01128411, + "balance_loss_clip": 1.00200355, + "balance_loss_mlp": 1.00090492, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 2.4227100667164017, + "language_loss": 0.69807255, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.7208969, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.5479886531829834 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01127912, + "balance_loss_clip": 1.0020436, + "balance_loss_mlp": 1.0007875, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.934383454732974, + "language_loss": 0.81863463, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84113514, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.6117401123046875 + }, + { + "auxiliary_loss_clip": 0.01137398, + "auxiliary_loss_mlp": 0.01127766, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00073695, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.6769640378938373, + "language_loss": 0.68479991, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70745158, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.6841986179351807 + }, + { + "auxiliary_loss_clip": 0.01140282, + "auxiliary_loss_mlp": 0.01128396, + "balance_loss_clip": 1.00194395, + "balance_loss_mlp": 1.00098562, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 3.2250633497741767, + "language_loss": 0.80476302, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.8274498, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.627688407897949 + }, + { + "auxiliary_loss_clip": 0.01155176, + "auxiliary_loss_mlp": 0.01128358, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00104284, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.425854178262224, + "language_loss": 0.75606251, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77889782, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.557093858718872 + }, + { + "auxiliary_loss_clip": 0.01167872, + "auxiliary_loss_mlp": 0.01110013, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00014925, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8147826084211816, + "language_loss": 0.6032843, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62606311, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 2.9977147579193115 + }, + { + "auxiliary_loss_clip": 0.01133907, + "auxiliary_loss_mlp": 0.01109917, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.00005329, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7806035477582982, + "language_loss": 0.55496341, + "learning_rate": 2.771075272396981e-06, + "loss": 0.5774017, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.267896890640259 + }, + { + "auxiliary_loss_clip": 0.01143547, + "auxiliary_loss_mlp": 0.0112894, + "balance_loss_clip": 1.0021677, + "balance_loss_mlp": 1.00105286, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.9327974163854151, + "language_loss": 0.75991225, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78263712, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.7028329372406006 + }, + { + "auxiliary_loss_clip": 0.01155586, + "auxiliary_loss_mlp": 0.01128779, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.00079632, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.0702489492453457, + "language_loss": 0.77936113, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80220473, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.517698049545288 + }, + { + "auxiliary_loss_clip": 0.01121983, + "auxiliary_loss_mlp": 0.01127855, + "balance_loss_clip": 1.00184, + "balance_loss_mlp": 1.00082517, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 1.7755968409041503, + "language_loss": 0.68642038, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70891875, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.656582832336426 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01127456, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00080812, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.6211403621518574, + "language_loss": 0.68792617, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71058762, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.634709119796753 + }, + { + "auxiliary_loss_clip": 0.01154027, + "auxiliary_loss_mlp": 0.01128266, + "balance_loss_clip": 1.00211072, + "balance_loss_mlp": 1.0009501, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6859703995222062, + "language_loss": 0.78959954, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81242245, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.506256580352783 + }, + { + "auxiliary_loss_clip": 0.01109194, + "auxiliary_loss_mlp": 0.01110054, + "balance_loss_clip": 1.00226951, + "balance_loss_mlp": 1.00019062, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8019167183807894, + "language_loss": 0.61869913, + "learning_rate": 2.768918627255683e-06, + "loss": 0.64089161, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 3.036963701248169 + }, + { + "auxiliary_loss_clip": 0.01137121, + "auxiliary_loss_mlp": 0.01128833, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.00085032, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.677564075706728, + "language_loss": 0.68245918, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70511872, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.7128350734710693 + }, + { + "auxiliary_loss_clip": 0.0113743, + "auxiliary_loss_mlp": 0.01128286, + "balance_loss_clip": 1.00199211, + "balance_loss_mlp": 1.00087547, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.6339921998068423, + "language_loss": 0.72641134, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.7490685, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 4.015084743499756 + }, + { + "auxiliary_loss_clip": 0.01167861, + "auxiliary_loss_mlp": 0.01109938, + "balance_loss_clip": 1.002056, + "balance_loss_mlp": 1.00007463, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8289820387777495, + "language_loss": 0.60380495, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62658298, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.92352294921875 + }, + { + "auxiliary_loss_clip": 0.01153886, + "auxiliary_loss_mlp": 0.01127495, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00094235, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.4410273972800194, + "language_loss": 0.82551503, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84832889, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.5529696941375732 + }, + { + "auxiliary_loss_clip": 0.01138331, + "auxiliary_loss_mlp": 0.01127652, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00071788, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.4508726643383534, + "language_loss": 0.68969208, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71235192, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.654538154602051 + }, + { + "auxiliary_loss_clip": 0.01138789, + "auxiliary_loss_mlp": 0.01128799, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00091171, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 2.4512816042593113, + "language_loss": 0.75989759, + "learning_rate": 2.76676093244553e-06, + "loss": 0.78257352, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 2.6548655033111572 + }, + { + "auxiliary_loss_clip": 0.01122437, + "auxiliary_loss_mlp": 0.01126871, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.00089049, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 2.2845626817756877, + "language_loss": 0.74860024, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.77109325, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 4.107873439788818 + }, + { + "auxiliary_loss_clip": 0.01137908, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_clip": 1.00220001, + "balance_loss_mlp": 1.00081611, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 1.590662808278278, + "language_loss": 0.81296808, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83563805, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.558028221130371 + }, + { + "auxiliary_loss_clip": 0.01155392, + "auxiliary_loss_mlp": 0.00748451, + "balance_loss_clip": 1.00200903, + "balance_loss_mlp": 1.00145686, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.694048919199528, + "language_loss": 0.84449792, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86353636, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.5124166011810303 + }, + { + "auxiliary_loss_clip": 0.01154808, + "auxiliary_loss_mlp": 0.00748191, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00109863, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5356493639143594, + "language_loss": 0.72831511, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74734509, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.544217824935913 + }, + { + "auxiliary_loss_clip": 0.01089404, + "auxiliary_loss_mlp": 0.01127863, + "balance_loss_clip": 1.00174785, + "balance_loss_mlp": 1.00073838, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.3840150333552697, + "language_loss": 0.77378666, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79595935, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 4.150060653686523 + }, + { + "auxiliary_loss_clip": 0.01127153, + "auxiliary_loss_mlp": 0.01128201, + "balance_loss_clip": 1.00232911, + "balance_loss_mlp": 1.00088525, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.587194706999102, + "language_loss": 0.81635964, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.8389132, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.6088359355926514 + }, + { + "auxiliary_loss_clip": 0.01153809, + "auxiliary_loss_mlp": 0.01126959, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00088286, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 6.882278787824181, + "language_loss": 0.80197942, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82478708, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 3.9216854572296143 + }, + { + "auxiliary_loss_clip": 0.01170737, + "auxiliary_loss_mlp": 0.01128619, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.00101709, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 1.5733830399174629, + "language_loss": 0.71204478, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73503834, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.4709980487823486 + }, + { + "auxiliary_loss_clip": 0.0115395, + "auxiliary_loss_mlp": 0.00748264, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00116897, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.5469417720500729, + "language_loss": 0.63875502, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65777719, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.610351324081421 + }, + { + "auxiliary_loss_clip": 0.01145046, + "auxiliary_loss_mlp": 0.01127906, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00087678, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 1.7897405469864716, + "language_loss": 0.79422683, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81695634, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.701389789581299 + }, + { + "auxiliary_loss_clip": 0.01139172, + "auxiliary_loss_mlp": 0.01128124, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00080872, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 7.650075347795087, + "language_loss": 0.71394372, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73661673, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 2.5981311798095703 + }, + { + "auxiliary_loss_clip": 0.01170648, + "auxiliary_loss_mlp": 0.01127942, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00081754, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.737363117852709, + "language_loss": 0.84027243, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.8632583, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.574934244155884 + }, + { + "auxiliary_loss_clip": 0.01154148, + "auxiliary_loss_mlp": 0.01127621, + "balance_loss_clip": 1.00178814, + "balance_loss_mlp": 1.00078189, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 2.249817111859267, + "language_loss": 0.80638957, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.82920724, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.5943586826324463 + }, + { + "auxiliary_loss_clip": 0.01170838, + "auxiliary_loss_mlp": 0.01128289, + "balance_loss_clip": 1.00221848, + "balance_loss_mlp": 1.0009737, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 3.322404552558607, + "language_loss": 0.70903409, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73202538, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.467761278152466 + }, + { + "auxiliary_loss_clip": 0.01137744, + "auxiliary_loss_mlp": 0.01128395, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.00079298, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.8721803451999912, + "language_loss": 0.80602795, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82868934, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.5486984252929688 + }, + { + "auxiliary_loss_clip": 0.01143566, + "auxiliary_loss_mlp": 0.01128948, + "balance_loss_clip": 1.00220954, + "balance_loss_mlp": 1.00115585, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 2.0155563986394696, + "language_loss": 0.82774681, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85047197, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.5331742763519287 + }, + { + "auxiliary_loss_clip": 0.01154158, + "auxiliary_loss_mlp": 0.01128109, + "balance_loss_clip": 1.00207484, + "balance_loss_mlp": 1.00098431, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.122462469967496, + "language_loss": 0.79925948, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.82208216, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 2.5172927379608154 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_clip": 1.00207603, + "balance_loss_mlp": 1.0010047, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.4557234480980998, + "language_loss": 0.81563216, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83829385, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.5863258838653564 + }, + { + "auxiliary_loss_clip": 0.01124028, + "auxiliary_loss_mlp": 0.01127702, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00095844, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9347288073801445, + "language_loss": 0.70220798, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72472525, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.6113381385803223 + }, + { + "auxiliary_loss_clip": 0.0115578, + "auxiliary_loss_mlp": 0.01128245, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00073898, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 1.9876983483615256, + "language_loss": 0.82979274, + "learning_rate": 2.759561073299676e-06, + "loss": 0.852633, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.5144193172454834 + }, + { + "auxiliary_loss_clip": 0.0112516, + "auxiliary_loss_mlp": 0.01127851, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00091672, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 2.2720627564522378, + "language_loss": 0.8359642, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.8584944, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.599903106689453 + }, + { + "auxiliary_loss_clip": 0.01170964, + "auxiliary_loss_mlp": 0.01128407, + "balance_loss_clip": 1.00211787, + "balance_loss_mlp": 1.00109124, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 1.6799722229147498, + "language_loss": 0.77611971, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79911345, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.545670747756958 + }, + { + "auxiliary_loss_clip": 0.01155365, + "auxiliary_loss_mlp": 0.01126701, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00091124, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.7916090394608146, + "language_loss": 0.8053875, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82820815, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.5059170722961426 + }, + { + "auxiliary_loss_clip": 0.01122244, + "auxiliary_loss_mlp": 0.01127452, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00061369, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.5893807087042098, + "language_loss": 0.84720862, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86970556, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.622488260269165 + }, + { + "auxiliary_loss_clip": 0.01091744, + "auxiliary_loss_mlp": 0.01128031, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00100112, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.6735409432932264, + "language_loss": 0.74643064, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76862836, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.728003740310669 + }, + { + "auxiliary_loss_clip": 0.01128481, + "auxiliary_loss_mlp": 0.01127628, + "balance_loss_clip": 1.00203001, + "balance_loss_mlp": 1.0006938, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.4857599229766774, + "language_loss": 0.79772842, + "learning_rate": 2.757398863979922e-06, + "loss": 0.82028949, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.608794927597046 + }, + { + "auxiliary_loss_clip": 0.01138747, + "auxiliary_loss_mlp": 0.01127472, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00101519, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.5844430288234024, + "language_loss": 0.77583653, + "learning_rate": 2.757038395157997e-06, + "loss": 0.79849875, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 2.602789878845215 + }, + { + "auxiliary_loss_clip": 0.01121991, + "auxiliary_loss_mlp": 0.01127771, + "balance_loss_clip": 1.00195026, + "balance_loss_mlp": 1.00083685, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.5887843940048658, + "language_loss": 0.7504667, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77296436, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.6844964027404785 + }, + { + "auxiliary_loss_clip": 0.01155399, + "auxiliary_loss_mlp": 0.01127528, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00088, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.5782586204509654, + "language_loss": 0.67820096, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70103025, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.752786159515381 + }, + { + "auxiliary_loss_clip": 0.0109174, + "auxiliary_loss_mlp": 0.01128016, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.00079584, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.084149916634752, + "language_loss": 0.71612239, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73831987, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.6681792736053467 + }, + { + "auxiliary_loss_clip": 0.01145015, + "auxiliary_loss_mlp": 0.0112852, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00101328, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.5728853057230907, + "language_loss": 0.74153924, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.7642746, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.5147511959075928 + }, + { + "auxiliary_loss_clip": 0.01170688, + "auxiliary_loss_mlp": 0.01127348, + "balance_loss_clip": 1.00202477, + "balance_loss_mlp": 1.00089073, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.223232367729727, + "language_loss": 0.84075767, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.863738, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.4784274101257324 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01128134, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00091422, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 2.3601148324990464, + "language_loss": 0.900244, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92293113, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.5866751670837402 + }, + { + "auxiliary_loss_clip": 0.01137257, + "auxiliary_loss_mlp": 0.01128213, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00089788, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.7564881938918853, + "language_loss": 0.78111601, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80377078, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 4.091441869735718 + }, + { + "auxiliary_loss_clip": 0.01106925, + "auxiliary_loss_mlp": 0.0112774, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.0008055, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.1373864899161, + "language_loss": 0.68826169, + "learning_rate": 2.754153612280037e-06, + "loss": 0.71060836, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.6232383251190186 + }, + { + "auxiliary_loss_clip": 0.01154063, + "auxiliary_loss_mlp": 0.0112766, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00082111, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.7530977764939688, + "language_loss": 0.58806801, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.61088526, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.5729153156280518 + }, + { + "auxiliary_loss_clip": 0.01139915, + "auxiliary_loss_mlp": 0.01128259, + "balance_loss_clip": 1.00202489, + "balance_loss_mlp": 1.00075316, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 1.6993370539950066, + "language_loss": 0.69503999, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.7177217, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.5522565841674805 + }, + { + "auxiliary_loss_clip": 0.01170687, + "auxiliary_loss_mlp": 0.00748136, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00112224, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 5.675878525253995, + "language_loss": 0.75713181, + "learning_rate": 2.753071346464642e-06, + "loss": 0.77632004, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.4644217491149902 + }, + { + "auxiliary_loss_clip": 0.01113251, + "auxiliary_loss_mlp": 0.00748178, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00128317, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 2.448212991092901, + "language_loss": 0.65977937, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.6783936, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 4.020548582077026 + }, + { + "auxiliary_loss_clip": 0.01121919, + "auxiliary_loss_mlp": 0.01128466, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.00086403, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 1.899027381809121, + "language_loss": 0.72861886, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.75112271, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 2.683590888977051 + }, + { + "auxiliary_loss_clip": 0.01122199, + "auxiliary_loss_mlp": 0.01127507, + "balance_loss_clip": 1.00173354, + "balance_loss_mlp": 1.00095427, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 4.160835482401876, + "language_loss": 0.7324357, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75493276, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.658116579055786 + }, + { + "auxiliary_loss_clip": 0.01145007, + "auxiliary_loss_mlp": 0.01127736, + "balance_loss_clip": 1.00213242, + "balance_loss_mlp": 1.00080228, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 1.858701176914576, + "language_loss": 0.71231794, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73504543, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 4.077322006225586 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01109106, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.0000056, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.8884104440165717, + "language_loss": 0.6111148, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63329285, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.0286686420440674 + }, + { + "auxiliary_loss_clip": 0.01136702, + "auxiliary_loss_mlp": 0.00748292, + "balance_loss_clip": 1.00187385, + "balance_loss_mlp": 1.00131309, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 2.2449906563590445, + "language_loss": 0.81546175, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83431172, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.6096115112304688 + }, + { + "auxiliary_loss_clip": 0.01137673, + "auxiliary_loss_mlp": 0.01127558, + "balance_loss_clip": 1.00179219, + "balance_loss_mlp": 1.00071919, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.080268696012642, + "language_loss": 0.70744681, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.73009908, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 3.953044891357422 + }, + { + "auxiliary_loss_clip": 0.01155537, + "auxiliary_loss_mlp": 0.0112772, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00107193, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.8621691208473123, + "language_loss": 0.75750595, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78033847, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.5359411239624023 + }, + { + "auxiliary_loss_clip": 0.01058175, + "auxiliary_loss_mlp": 0.01127517, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00105977, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.591184829279263, + "language_loss": 0.7830385, + "learning_rate": 2.749823008443152e-06, + "loss": 0.8048954, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 2.8936519622802734 + }, + { + "auxiliary_loss_clip": 0.01089908, + "auxiliary_loss_mlp": 0.0112671, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00063443, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 2.2935523345626683, + "language_loss": 0.69623882, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71840501, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 2.999018430709839 + }, + { + "auxiliary_loss_clip": 0.01081342, + "auxiliary_loss_mlp": 0.01127969, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00084448, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.5001381455489782, + "language_loss": 0.77718508, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.79927814, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.727235794067383 + }, + { + "auxiliary_loss_clip": 0.01119258, + "auxiliary_loss_mlp": 0.01109836, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 0.99997276, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9429123409993242, + "language_loss": 0.6304, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65269095, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.2472379207611084 + }, + { + "auxiliary_loss_clip": 0.01121692, + "auxiliary_loss_mlp": 0.01127951, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00101733, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 1.9279283248687487, + "language_loss": 0.63373476, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65623122, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.6599068641662598 + }, + { + "auxiliary_loss_clip": 0.0115524, + "auxiliary_loss_mlp": 0.01126483, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00088429, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.065668239568068, + "language_loss": 0.78567898, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80849624, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 2.5973591804504395 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.00748251, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.0012846, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.7954436019715192, + "language_loss": 0.67029691, + "learning_rate": 2.747656169644941e-06, + "loss": 0.6890142, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.700559616088867 + }, + { + "auxiliary_loss_clip": 0.01170723, + "auxiliary_loss_mlp": 0.01127919, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00108075, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 3.0100304259062067, + "language_loss": 0.78834832, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81133473, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.5161337852478027 + }, + { + "auxiliary_loss_clip": 0.01129458, + "auxiliary_loss_mlp": 0.01127654, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00071979, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.7513567597193112, + "language_loss": 0.72651255, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74908364, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 2.6648991107940674 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01127338, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00078559, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.0281590346889247, + "language_loss": 0.85570979, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87822199, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.5980660915374756 + }, + { + "auxiliary_loss_clip": 0.01122723, + "auxiliary_loss_mlp": 0.011281, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00097489, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.4683053682870773, + "language_loss": 0.70030421, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72281241, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.5709140300750732 + }, + { + "auxiliary_loss_clip": 0.01170673, + "auxiliary_loss_mlp": 0.01127809, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00106549, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.131288851998476, + "language_loss": 0.83472651, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85771132, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.459402322769165 + }, + { + "auxiliary_loss_clip": 0.01139673, + "auxiliary_loss_mlp": 0.01127089, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00091779, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.4804633195422152, + "language_loss": 0.72861195, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75127959, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 2.546391487121582 + }, + { + "auxiliary_loss_clip": 0.01138194, + "auxiliary_loss_mlp": 0.01126384, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00078547, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 2.212320488923912, + "language_loss": 0.82367218, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84631789, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.616185426712036 + }, + { + "auxiliary_loss_clip": 0.01170662, + "auxiliary_loss_mlp": 0.01126986, + "balance_loss_clip": 1.00205266, + "balance_loss_mlp": 1.00071907, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.498590431127523, + "language_loss": 0.73670137, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75967783, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.5194849967956543 + }, + { + "auxiliary_loss_clip": 0.01122388, + "auxiliary_loss_mlp": 0.01127462, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00100505, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 4.987976176691704, + "language_loss": 0.74247795, + "learning_rate": 2.744403998666805e-06, + "loss": 0.7649765, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.6387314796447754 + }, + { + "auxiliary_loss_clip": 0.01155815, + "auxiliary_loss_mlp": 0.01127717, + "balance_loss_clip": 1.00215459, + "balance_loss_mlp": 1.00087833, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.546084015777739, + "language_loss": 0.67988813, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70272344, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.74603009223938 + }, + { + "auxiliary_loss_clip": 0.01123726, + "auxiliary_loss_mlp": 0.01127762, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00082779, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 1.744084942654542, + "language_loss": 0.74227089, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76478577, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.652330160140991 + }, + { + "auxiliary_loss_clip": 0.01143479, + "auxiliary_loss_mlp": 0.01127897, + "balance_loss_clip": 1.00212049, + "balance_loss_mlp": 1.00086808, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.6299393880988633, + "language_loss": 0.7139343, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73664808, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.5787272453308105 + }, + { + "auxiliary_loss_clip": 0.0115536, + "auxiliary_loss_mlp": 0.0112674, + "balance_loss_clip": 1.0020293, + "balance_loss_mlp": 1.00066423, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.5068586837484754, + "language_loss": 0.78647947, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80930042, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.576462507247925 + }, + { + "auxiliary_loss_clip": 0.01154091, + "auxiliary_loss_mlp": 0.01127717, + "balance_loss_clip": 1.00208819, + "balance_loss_mlp": 1.0006876, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.841527515508328, + "language_loss": 0.79613179, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81894994, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.7095754146575928 + }, + { + "auxiliary_loss_clip": 0.01117187, + "auxiliary_loss_mlp": 0.01109865, + "balance_loss_clip": 1.00152719, + "balance_loss_mlp": 1.00000191, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8652809657746333, + "language_loss": 0.64919102, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67146152, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.0938453674316406 + }, + { + "auxiliary_loss_clip": 0.01140214, + "auxiliary_loss_mlp": 0.01127671, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00083256, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.6154944523767636, + "language_loss": 0.72424507, + "learning_rate": 2.741872951078109e-06, + "loss": 0.74692392, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.592069625854492 + }, + { + "auxiliary_loss_clip": 0.01153958, + "auxiliary_loss_mlp": 0.01127418, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00076985, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.6908883665916226, + "language_loss": 0.81682992, + "learning_rate": 2.741511260213862e-06, + "loss": 0.8396436, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.516759157180786 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.01127482, + "balance_loss_clip": 1.00217593, + "balance_loss_mlp": 1.00073898, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 1.952549105816816, + "language_loss": 0.67175519, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69429791, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 4.167057752609253 + }, + { + "auxiliary_loss_clip": 0.01170781, + "auxiliary_loss_mlp": 0.01127727, + "balance_loss_clip": 1.00203633, + "balance_loss_mlp": 1.00088859, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.002964298060123, + "language_loss": 0.83710098, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86008608, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.5089011192321777 + }, + { + "auxiliary_loss_clip": 0.01170595, + "auxiliary_loss_mlp": 0.01126608, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00081885, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.657557896145214, + "language_loss": 0.72269738, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74566936, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.491227388381958 + }, + { + "auxiliary_loss_clip": 0.01137382, + "auxiliary_loss_mlp": 0.01126676, + "balance_loss_clip": 1.00191987, + "balance_loss_mlp": 1.00069594, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.6841442693607094, + "language_loss": 0.65335989, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67600042, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.6282174587249756 + }, + { + "auxiliary_loss_clip": 0.01167623, + "auxiliary_loss_mlp": 0.01109158, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00005758, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7687434149767892, + "language_loss": 0.58248436, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60525227, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.0640647411346436 + }, + { + "auxiliary_loss_clip": 0.01139101, + "auxiliary_loss_mlp": 0.01126679, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00079441, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.600294607151723, + "language_loss": 0.7909503, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81360805, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 3.924171209335327 + }, + { + "auxiliary_loss_clip": 0.01137885, + "auxiliary_loss_mlp": 0.01126395, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00070107, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.6964553995076441, + "language_loss": 0.78139615, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80403894, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.5783238410949707 + }, + { + "auxiliary_loss_clip": 0.01138632, + "auxiliary_loss_mlp": 0.01127266, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00071371, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.9521387662670044, + "language_loss": 0.74894392, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77160287, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.5851030349731445 + }, + { + "auxiliary_loss_clip": 0.01122072, + "auxiliary_loss_mlp": 0.01127255, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00098896, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.8153959371286608, + "language_loss": 0.79628527, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81877851, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.01170674, + "auxiliary_loss_mlp": 0.01127501, + "balance_loss_clip": 1.00206172, + "balance_loss_mlp": 1.00094795, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 1.9642138360775792, + "language_loss": 0.83943081, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.86241251, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 3.9436795711517334 + }, + { + "auxiliary_loss_clip": 0.01153655, + "auxiliary_loss_mlp": 0.01126992, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00082123, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.2946831491595963, + "language_loss": 0.86839676, + "learning_rate": 2.737530807925321e-06, + "loss": 0.89120328, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.5101075172424316 + }, + { + "auxiliary_loss_clip": 0.0109178, + "auxiliary_loss_mlp": 0.00748389, + "balance_loss_clip": 1.00175691, + "balance_loss_mlp": 1.00145841, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3407343539068934, + "language_loss": 0.83430761, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85270929, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.65515398979187 + }, + { + "auxiliary_loss_clip": 0.011282, + "auxiliary_loss_mlp": 0.00748196, + "balance_loss_clip": 1.00214684, + "balance_loss_mlp": 1.00143647, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.5823531311784926, + "language_loss": 0.8299948, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84875876, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 4.013158082962036 + }, + { + "auxiliary_loss_clip": 0.01123514, + "auxiliary_loss_mlp": 0.01128064, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00103509, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.5271116352953087, + "language_loss": 0.70941132, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73192704, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.6428163051605225 + }, + { + "auxiliary_loss_clip": 0.01121448, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00085282, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.735345430723204, + "language_loss": 0.80702883, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82950503, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.6247365474700928 + }, + { + "auxiliary_loss_clip": 0.01088271, + "auxiliary_loss_mlp": 0.01126705, + "balance_loss_clip": 1.00164485, + "balance_loss_mlp": 1.00062954, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 2.423132815643904, + "language_loss": 0.75312209, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77527189, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.6636266708374023 + }, + { + "auxiliary_loss_clip": 0.01123188, + "auxiliary_loss_mlp": 0.01126865, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.0006938, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 1.826038097730753, + "language_loss": 0.71729696, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73979747, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.5838940143585205 + }, + { + "auxiliary_loss_clip": 0.01088721, + "auxiliary_loss_mlp": 0.00748212, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00146675, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.8861828199303665, + "language_loss": 0.74930251, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76767182, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.7014973163604736 + }, + { + "auxiliary_loss_clip": 0.01138682, + "auxiliary_loss_mlp": 0.01126255, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00065601, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 9.893650132755887, + "language_loss": 0.81533301, + "learning_rate": 2.7346338069806e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.600942373275757 + }, + { + "auxiliary_loss_clip": 0.01139068, + "auxiliary_loss_mlp": 0.01126744, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00085878, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 3.002771834720952, + "language_loss": 0.75008965, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.77274776, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.598618984222412 + }, + { + "auxiliary_loss_clip": 0.01127974, + "auxiliary_loss_mlp": 0.01127522, + "balance_loss_clip": 1.00177634, + "balance_loss_mlp": 1.0008738, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.731526540695049, + "language_loss": 0.65887934, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68143427, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.614326238632202 + }, + { + "auxiliary_loss_clip": 0.01153806, + "auxiliary_loss_mlp": 0.01125976, + "balance_loss_clip": 1.00199294, + "balance_loss_mlp": 1.00075841, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.574522357097023, + "language_loss": 0.81302881, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83582664, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 2.551440954208374 + }, + { + "auxiliary_loss_clip": 0.01120502, + "auxiliary_loss_mlp": 0.0110862, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00028253, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7169918334641421, + "language_loss": 0.53142494, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55371618, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.2518980503082275 + }, + { + "auxiliary_loss_clip": 0.01137224, + "auxiliary_loss_mlp": 0.00748445, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00169051, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.435652430338769, + "language_loss": 0.75388086, + "learning_rate": 2.732822275578769e-06, + "loss": 0.7727375, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.5914270877838135 + }, + { + "auxiliary_loss_clip": 0.01093107, + "auxiliary_loss_mlp": 0.011266, + "balance_loss_clip": 1.00202489, + "balance_loss_mlp": 1.00081062, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.5259470963447188, + "language_loss": 0.76074463, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.7829417, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 2.79268479347229 + }, + { + "auxiliary_loss_clip": 0.0112218, + "auxiliary_loss_mlp": 0.01126648, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.00085807, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 1.9987695762599955, + "language_loss": 0.82194614, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.8444345, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 2.6392929553985596 + }, + { + "auxiliary_loss_clip": 0.01170612, + "auxiliary_loss_mlp": 0.0112698, + "balance_loss_clip": 1.00207388, + "balance_loss_mlp": 1.00071359, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 1.9315564800278993, + "language_loss": 0.77206957, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.7950455, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.561316967010498 + }, + { + "auxiliary_loss_clip": 0.0113701, + "auxiliary_loss_mlp": 0.01127101, + "balance_loss_clip": 1.00178373, + "balance_loss_mlp": 1.00073862, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.0326826504553996, + "language_loss": 0.72401899, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74666011, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 2.5680885314941406 + }, + { + "auxiliary_loss_clip": 0.01153757, + "auxiliary_loss_mlp": 0.01126876, + "balance_loss_clip": 1.00188935, + "balance_loss_mlp": 1.00070512, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.6587957255543389, + "language_loss": 0.66381437, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68662065, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.5571000576019287 + }, + { + "auxiliary_loss_clip": 0.01170417, + "auxiliary_loss_mlp": 0.01126249, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.0008409, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 2.1168909354820746, + "language_loss": 0.78158724, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80455387, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 2.466500759124756 + }, + { + "auxiliary_loss_clip": 0.01153909, + "auxiliary_loss_mlp": 0.01127026, + "balance_loss_clip": 1.00192678, + "balance_loss_mlp": 1.00056911, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.4225103955624334, + "language_loss": 0.69790846, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72071785, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.5430994033813477 + }, + { + "auxiliary_loss_clip": 0.01106952, + "auxiliary_loss_mlp": 0.01126774, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.0008893, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7821062191182409, + "language_loss": 0.71579134, + "learning_rate": 2.729922381038513e-06, + "loss": 0.7381286, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.6585257053375244 + }, + { + "auxiliary_loss_clip": 0.01121957, + "auxiliary_loss_mlp": 0.01126393, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.00089002, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.9497136477535315, + "language_loss": 0.73826814, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76075166, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.667970657348633 + }, + { + "auxiliary_loss_clip": 0.01170529, + "auxiliary_loss_mlp": 0.01126553, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00066757, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 1.8237296031642818, + "language_loss": 0.65738916, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68035996, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 2.4734394550323486 + }, + { + "auxiliary_loss_clip": 0.0112293, + "auxiliary_loss_mlp": 0.01126183, + "balance_loss_clip": 1.00190163, + "balance_loss_mlp": 1.00077462, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.6989049835752883, + "language_loss": 0.75666934, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77916044, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.673938035964966 + }, + { + "auxiliary_loss_clip": 0.01170546, + "auxiliary_loss_mlp": 0.01126962, + "balance_loss_clip": 1.00206017, + "balance_loss_mlp": 1.00098109, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.4563737768343088, + "language_loss": 0.71964926, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74262428, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.5356204509735107 + }, + { + "auxiliary_loss_clip": 0.01170506, + "auxiliary_loss_mlp": 0.01126765, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00078428, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.6986699608574038, + "language_loss": 0.73178369, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75475639, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.519725799560547 + }, + { + "auxiliary_loss_clip": 0.01118751, + "auxiliary_loss_mlp": 0.01109108, + "balance_loss_clip": 1.0015502, + "balance_loss_mlp": 1.00000751, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8557644572882135, + "language_loss": 0.60582596, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62810451, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.1262104511260986 + }, + { + "auxiliary_loss_clip": 0.01120101, + "auxiliary_loss_mlp": 0.01126623, + "balance_loss_clip": 1.00185084, + "balance_loss_mlp": 1.00092876, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 1.9671175272664165, + "language_loss": 0.66885686, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69132406, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 4.2821571826934814 + }, + { + "auxiliary_loss_clip": 0.01153735, + "auxiliary_loss_mlp": 0.01126561, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00086665, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.7858671958233727, + "language_loss": 0.89528954, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91809243, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 2.569964647293091 + }, + { + "auxiliary_loss_clip": 0.01139694, + "auxiliary_loss_mlp": 0.01125648, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00071633, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5372673423389713, + "language_loss": 0.73623395, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75888735, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.662559747695923 + }, + { + "auxiliary_loss_clip": 0.01170632, + "auxiliary_loss_mlp": 0.01126697, + "balance_loss_clip": 1.00202632, + "balance_loss_mlp": 1.00081229, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.6948186079780045, + "language_loss": 0.73396242, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75693572, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.5712411403656006 + }, + { + "auxiliary_loss_clip": 0.01170677, + "auxiliary_loss_mlp": 0.01127671, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00092793, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4163968981337542, + "language_loss": 0.79702628, + "learning_rate": 2.725932135056117e-06, + "loss": 0.82000971, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 4.132418155670166 + }, + { + "auxiliary_loss_clip": 0.01155524, + "auxiliary_loss_mlp": 0.01127232, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00087023, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.7515553891887445, + "language_loss": 0.77446604, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79729354, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.5853018760681152 + }, + { + "auxiliary_loss_clip": 0.01170244, + "auxiliary_loss_mlp": 0.01125206, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00084734, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.8453376598039095, + "language_loss": 0.72608238, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74903685, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.485992193222046 + }, + { + "auxiliary_loss_clip": 0.01138668, + "auxiliary_loss_mlp": 0.011262, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00079226, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.6958397375070746, + "language_loss": 0.71444285, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.7370916, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.6425304412841797 + }, + { + "auxiliary_loss_clip": 0.01170677, + "auxiliary_loss_mlp": 0.01126686, + "balance_loss_clip": 1.00213969, + "balance_loss_mlp": 1.00089633, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.7869174713549523, + "language_loss": 0.75615227, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77912593, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 3.932612657546997 + }, + { + "auxiliary_loss_clip": 0.01155524, + "auxiliary_loss_mlp": 0.0112637, + "balance_loss_clip": 1.00201523, + "balance_loss_mlp": 1.00067592, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.320200322522288, + "language_loss": 0.66376162, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68658054, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.5179460048675537 + }, + { + "auxiliary_loss_clip": 0.01155327, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_clip": 1.00203598, + "balance_loss_mlp": 1.00065351, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 5.190886435820959, + "language_loss": 0.8563062, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87912965, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 3.9062814712524414 + }, + { + "auxiliary_loss_clip": 0.01153874, + "auxiliary_loss_mlp": 0.0112615, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00064611, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 1.9975247194679377, + "language_loss": 0.84886265, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87166286, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 2.540769100189209 + }, + { + "auxiliary_loss_clip": 0.01154076, + "auxiliary_loss_mlp": 0.01126651, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00067055, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.6521150695211628, + "language_loss": 0.78124779, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80405509, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.5543594360351562 + }, + { + "auxiliary_loss_clip": 0.01153899, + "auxiliary_loss_mlp": 0.01127477, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00082958, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.7443727223241974, + "language_loss": 0.73649263, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75930643, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.583491086959839 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01127066, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00108612, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.4280907982545712, + "language_loss": 0.75868547, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78151309, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.5547335147857666 + }, + { + "auxiliary_loss_clip": 0.0112289, + "auxiliary_loss_mlp": 0.01126267, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00085914, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 2.3357540242767287, + "language_loss": 0.82195091, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84444249, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.717534303665161 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.01108663, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00032544, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.6966953401540872, + "language_loss": 0.53292495, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55541915, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.3484947681427 + }, + { + "auxiliary_loss_clip": 0.01104889, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_clip": 1.00179923, + "balance_loss_mlp": 1.00082898, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.5359486744392883, + "language_loss": 0.88781387, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.910128, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.736440896987915 + }, + { + "auxiliary_loss_clip": 0.01155395, + "auxiliary_loss_mlp": 0.01126711, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00063562, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.809659913260283, + "language_loss": 0.79257214, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81539321, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.5576488971710205 + }, + { + "auxiliary_loss_clip": 0.01123882, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_clip": 1.00191057, + "balance_loss_mlp": 1.00061345, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 1.940861657230135, + "language_loss": 0.6312865, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65378737, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.6283211708068848 + }, + { + "auxiliary_loss_clip": 0.01137207, + "auxiliary_loss_mlp": 0.00748352, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00165665, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.384114634812289, + "language_loss": 0.80450106, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82335663, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.654878616333008 + }, + { + "auxiliary_loss_clip": 0.01104658, + "auxiliary_loss_mlp": 0.0112659, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00060987, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.4801956101389173, + "language_loss": 0.82660866, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84892112, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.62841534614563 + }, + { + "auxiliary_loss_clip": 0.01155354, + "auxiliary_loss_mlp": 0.01127089, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00072718, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.633280182345365, + "language_loss": 0.93039763, + "learning_rate": 2.71939546536012e-06, + "loss": 0.9532221, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.6658005714416504 + }, + { + "auxiliary_loss_clip": 0.01153853, + "auxiliary_loss_mlp": 0.011275, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00066161, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 1.8630007068024352, + "language_loss": 0.79553604, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81834948, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.55245304107666 + }, + { + "auxiliary_loss_clip": 0.0114313, + "auxiliary_loss_mlp": 0.01126776, + "balance_loss_clip": 1.00229597, + "balance_loss_mlp": 1.00079608, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 2.1698737276747964, + "language_loss": 0.83944499, + "learning_rate": 2.71866862166691e-06, + "loss": 0.86214399, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 2.599522352218628 + }, + { + "auxiliary_loss_clip": 0.01170423, + "auxiliary_loss_mlp": 0.01126193, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00068915, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.4332745951323234, + "language_loss": 0.63772619, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66069227, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 2.482064723968506 + }, + { + "auxiliary_loss_clip": 0.01138548, + "auxiliary_loss_mlp": 0.01125701, + "balance_loss_clip": 1.00200212, + "balance_loss_mlp": 1.00076926, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.5007504408566952, + "language_loss": 0.7908228, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81346524, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.673086643218994 + }, + { + "auxiliary_loss_clip": 0.01121916, + "auxiliary_loss_mlp": 0.00748502, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00181127, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.7738741906657598, + "language_loss": 0.75788474, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77658892, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.6229147911071777 + }, + { + "auxiliary_loss_clip": 0.0111263, + "auxiliary_loss_mlp": 0.01127606, + "balance_loss_clip": 1.00218368, + "balance_loss_mlp": 1.00057626, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.8567168770931666, + "language_loss": 0.64034247, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66274482, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.672424077987671 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01126112, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00070405, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.934447444809068, + "language_loss": 0.73305452, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75537813, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 2.7162396907806396 + }, + { + "auxiliary_loss_clip": 0.01153691, + "auxiliary_loss_mlp": 0.01126593, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00089908, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.9066344753673128, + "language_loss": 0.72985101, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75265384, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.6003119945526123 + }, + { + "auxiliary_loss_clip": 0.01156539, + "auxiliary_loss_mlp": 0.01108479, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00014138, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8052573532127527, + "language_loss": 0.60433084, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62698102, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.2383267879486084 + }, + { + "auxiliary_loss_clip": 0.01159903, + "auxiliary_loss_mlp": 0.01127145, + "balance_loss_clip": 1.00221646, + "balance_loss_mlp": 1.00068831, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.6828568123657328, + "language_loss": 0.70171225, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72458273, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.522731065750122 + }, + { + "auxiliary_loss_clip": 0.01137264, + "auxiliary_loss_mlp": 0.01125874, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00075209, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.3941174517852615, + "language_loss": 0.74831235, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77094376, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.6115074157714844 + }, + { + "auxiliary_loss_clip": 0.0113723, + "auxiliary_loss_mlp": 0.01127018, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00065649, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.7480675498345914, + "language_loss": 0.71121001, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73385245, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.5994083881378174 + }, + { + "auxiliary_loss_clip": 0.01138606, + "auxiliary_loss_mlp": 0.01128008, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00078797, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.6189884325230874, + "language_loss": 0.64375931, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66642547, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.627990245819092 + }, + { + "auxiliary_loss_clip": 0.01154997, + "auxiliary_loss_mlp": 0.01126968, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00060606, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.1975525217949063, + "language_loss": 0.7334345, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75625414, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.5156428813934326 + }, + { + "auxiliary_loss_clip": 0.0112356, + "auxiliary_loss_mlp": 0.01126588, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00079823, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.5144980304137519, + "language_loss": 0.74889928, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.77140081, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 4.092395067214966 + }, + { + "auxiliary_loss_clip": 0.01138755, + "auxiliary_loss_mlp": 0.01127646, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.00080705, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5024102770741818, + "language_loss": 0.72105718, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74372119, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.5883729457855225 + }, + { + "auxiliary_loss_clip": 0.01106742, + "auxiliary_loss_mlp": 0.01126581, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00079155, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 1.8708523167739008, + "language_loss": 0.84000051, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86233377, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.693753719329834 + }, + { + "auxiliary_loss_clip": 0.01122106, + "auxiliary_loss_mlp": 0.01127033, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.00095785, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.6923218448835482, + "language_loss": 0.71233737, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73482877, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.761216640472412 + }, + { + "auxiliary_loss_clip": 0.01137403, + "auxiliary_loss_mlp": 0.01126977, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00099683, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 1.9231875306744142, + "language_loss": 0.68219435, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.70483816, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.5964081287384033 + }, + { + "auxiliary_loss_clip": 0.0113842, + "auxiliary_loss_mlp": 0.01126609, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00072432, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 1.9144346117691153, + "language_loss": 0.79318738, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81583774, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 4.393254280090332 + }, + { + "auxiliary_loss_clip": 0.01138576, + "auxiliary_loss_mlp": 0.01127267, + "balance_loss_clip": 1.00199616, + "balance_loss_mlp": 1.00080991, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.838957129303364, + "language_loss": 0.71043217, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73309058, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.5913383960723877 + }, + { + "auxiliary_loss_clip": 0.01155146, + "auxiliary_loss_mlp": 0.01126729, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00103498, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 5.02972203188639, + "language_loss": 0.61352468, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63634336, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.592097520828247 + }, + { + "auxiliary_loss_clip": 0.01153791, + "auxiliary_loss_mlp": 0.01125972, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00075471, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.9527879001171953, + "language_loss": 0.77080065, + "learning_rate": 2.711030202621491e-06, + "loss": 0.79359829, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.530827522277832 + }, + { + "auxiliary_loss_clip": 0.01121602, + "auxiliary_loss_mlp": 0.01126252, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00055778, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.5894564128370383, + "language_loss": 0.80151045, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82398903, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 4.043520927429199 + }, + { + "auxiliary_loss_clip": 0.01137411, + "auxiliary_loss_mlp": 0.01127123, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00076091, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 1.8682786708933066, + "language_loss": 0.74289632, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.76554167, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 4.099162340164185 + }, + { + "auxiliary_loss_clip": 0.01138736, + "auxiliary_loss_mlp": 0.01126528, + "balance_loss_clip": 1.00172663, + "balance_loss_mlp": 1.00073791, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.4202100364254442, + "language_loss": 0.66096085, + "learning_rate": 2.709938026276208e-06, + "loss": 0.68361342, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.6331241130828857 + }, + { + "auxiliary_loss_clip": 0.01138985, + "auxiliary_loss_mlp": 0.01126602, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00081289, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.5396193547640957, + "language_loss": 0.65917438, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68183029, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 2.584326982498169 + }, + { + "auxiliary_loss_clip": 0.01075023, + "auxiliary_loss_mlp": 0.01126668, + "balance_loss_clip": 1.0016377, + "balance_loss_mlp": 1.00068808, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 3.0921124093838643, + "language_loss": 0.82230824, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84432518, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 2.7677111625671387 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.01126731, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00065506, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.6625336525370065, + "language_loss": 0.7332381, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75593269, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 2.579784393310547 + }, + { + "auxiliary_loss_clip": 0.01153518, + "auxiliary_loss_mlp": 0.01126205, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00079668, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.86222014812436, + "language_loss": 0.66758674, + "learning_rate": 2.708481414320713e-06, + "loss": 0.69038391, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.5748748779296875 + }, + { + "auxiliary_loss_clip": 0.01153747, + "auxiliary_loss_mlp": 0.01126568, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.0008738, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.749319110882128, + "language_loss": 0.71315265, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73595583, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.613126516342163 + }, + { + "auxiliary_loss_clip": 0.01136516, + "auxiliary_loss_mlp": 0.01124898, + "balance_loss_clip": 1.00170612, + "balance_loss_mlp": 1.00044394, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.6710124333215572, + "language_loss": 0.79617631, + "learning_rate": 2.707752947093611e-06, + "loss": 0.81879044, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.668658971786499 + }, + { + "auxiliary_loss_clip": 0.01105613, + "auxiliary_loss_mlp": 0.01126701, + "balance_loss_clip": 1.00172389, + "balance_loss_mlp": 1.00081587, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.325012182547576, + "language_loss": 0.83346474, + "learning_rate": 2.70738867321606e-06, + "loss": 0.85578787, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.6067960262298584 + }, + { + "auxiliary_loss_clip": 0.01153712, + "auxiliary_loss_mlp": 0.01127385, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00083232, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.5236257498239016, + "language_loss": 0.71145654, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73426747, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.6407785415649414 + }, + { + "auxiliary_loss_clip": 0.01138854, + "auxiliary_loss_mlp": 0.01126466, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00086653, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.085874470834911, + "language_loss": 0.84604931, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.86870241, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.544420003890991 + }, + { + "auxiliary_loss_clip": 0.011554, + "auxiliary_loss_mlp": 0.01126232, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00072837, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 2.1121851139196957, + "language_loss": 0.75984442, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78266084, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.5571486949920654 + }, + { + "auxiliary_loss_clip": 0.01145185, + "auxiliary_loss_mlp": 0.01126563, + "balance_loss_clip": 1.00228548, + "balance_loss_mlp": 1.00077379, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 1.9074501853727632, + "language_loss": 0.79161251, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81432998, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 2.6564948558807373 + }, + { + "auxiliary_loss_clip": 0.01124607, + "auxiliary_loss_mlp": 0.01126252, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00074804, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8624584949284722, + "language_loss": 0.87863624, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90114486, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.6039388179779053 + }, + { + "auxiliary_loss_clip": 0.01153727, + "auxiliary_loss_mlp": 0.01125921, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00070381, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.4959821413797987, + "language_loss": 0.69246924, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71526575, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.5319831371307373 + }, + { + "auxiliary_loss_clip": 0.01106429, + "auxiliary_loss_mlp": 0.0112641, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00071609, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 2.0815182448938776, + "language_loss": 0.77498758, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79731607, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 2.6346421241760254 + }, + { + "auxiliary_loss_clip": 0.01106336, + "auxiliary_loss_mlp": 0.01125598, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00085735, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.783309380408738, + "language_loss": 0.76435906, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78667843, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.628924608230591 + }, + { + "auxiliary_loss_clip": 0.01139874, + "auxiliary_loss_mlp": 0.01107729, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00015438, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9363622563360746, + "language_loss": 0.60726261, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62973863, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 3.004939556121826 + }, + { + "auxiliary_loss_clip": 0.01170536, + "auxiliary_loss_mlp": 0.01126711, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00082612, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 1.7915777282646717, + "language_loss": 0.7451939, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76816636, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 2.496021032333374 + }, + { + "auxiliary_loss_clip": 0.01155443, + "auxiliary_loss_mlp": 0.0112627, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00095713, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.149609930522788, + "language_loss": 0.80942297, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83224016, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.588106155395508 + }, + { + "auxiliary_loss_clip": 0.0113659, + "auxiliary_loss_mlp": 0.01126341, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00064707, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.8955180883334364, + "language_loss": 0.76793265, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79056191, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.556802749633789 + }, + { + "auxiliary_loss_clip": 0.01121404, + "auxiliary_loss_mlp": 0.01125725, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00069857, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.6303322673837404, + "language_loss": 0.72532797, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74779928, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.6596317291259766 + }, + { + "auxiliary_loss_clip": 0.01153657, + "auxiliary_loss_mlp": 0.01125786, + "balance_loss_clip": 1.00192153, + "balance_loss_mlp": 1.00075924, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.7581627277183685, + "language_loss": 0.65285015, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67564464, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.5309407711029053 + }, + { + "auxiliary_loss_clip": 0.01154181, + "auxiliary_loss_mlp": 0.01127138, + "balance_loss_clip": 1.00208247, + "balance_loss_mlp": 1.00077629, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.419796739136194, + "language_loss": 0.73754895, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76036209, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.5545380115509033 + }, + { + "auxiliary_loss_clip": 0.01137289, + "auxiliary_loss_mlp": 0.0112504, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.0006814, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.959627516470749, + "language_loss": 0.74805188, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.77067518, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.639679193496704 + }, + { + "auxiliary_loss_clip": 0.01153571, + "auxiliary_loss_mlp": 0.01126186, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00077796, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.4930865137836526, + "language_loss": 0.76496148, + "learning_rate": 2.701191924463126e-06, + "loss": 0.78775907, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.743722438812256 + }, + { + "auxiliary_loss_clip": 0.0113959, + "auxiliary_loss_mlp": 0.00748439, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00174677, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 1.9246119525197987, + "language_loss": 0.81440997, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83329028, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.566988945007324 + }, + { + "auxiliary_loss_clip": 0.01170441, + "auxiliary_loss_mlp": 0.01125159, + "balance_loss_clip": 1.00203562, + "balance_loss_mlp": 1.00060892, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 1.9571185157398125, + "language_loss": 0.85289025, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87584627, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.480447769165039 + }, + { + "auxiliary_loss_clip": 0.01122201, + "auxiliary_loss_mlp": 0.01126123, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00090563, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.673635825186972, + "language_loss": 0.82092774, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84341097, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 4.160146474838257 + }, + { + "auxiliary_loss_clip": 0.01138072, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00080454, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 2.189840873250871, + "language_loss": 0.73689866, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75954723, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.621551275253296 + }, + { + "auxiliary_loss_clip": 0.01155404, + "auxiliary_loss_mlp": 0.01126292, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00088394, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.7603147891704711, + "language_loss": 0.67728126, + "learning_rate": 2.699367885848985e-06, + "loss": 0.70009828, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.6902294158935547 + }, + { + "auxiliary_loss_clip": 0.01170428, + "auxiliary_loss_mlp": 0.01125708, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.00087237, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 2.8282295610832877, + "language_loss": 0.73909318, + "learning_rate": 2.699002998510517e-06, + "loss": 0.7620545, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.5371313095092773 + }, + { + "auxiliary_loss_clip": 0.01137387, + "auxiliary_loss_mlp": 0.00748274, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00158918, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.7664361250442822, + "language_loss": 0.77465016, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79350674, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.5609312057495117 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.01125932, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00080991, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.947966858920717, + "language_loss": 0.76731122, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78997087, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 4.065882921218872 + }, + { + "auxiliary_loss_clip": 0.01138581, + "auxiliary_loss_mlp": 0.01126307, + "balance_loss_clip": 1.00198638, + "balance_loss_mlp": 1.0007081, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.350789669892696, + "language_loss": 0.65177429, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.6744231, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.582038164138794 + }, + { + "auxiliary_loss_clip": 0.0112264, + "auxiliary_loss_mlp": 0.01125412, + "balance_loss_clip": 1.00175166, + "balance_loss_mlp": 1.00067174, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.840360142246923, + "language_loss": 0.83119059, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85367113, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.6185972690582275 + }, + { + "auxiliary_loss_clip": 0.01122481, + "auxiliary_loss_mlp": 0.00748433, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.0017271, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.5696391445363074, + "language_loss": 0.7498877, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.76859683, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.6612157821655273 + }, + { + "auxiliary_loss_clip": 0.01153579, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00093877, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.4834154879524606, + "language_loss": 0.7152741, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73806101, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 3.928914785385132 + }, + { + "auxiliary_loss_clip": 0.01121707, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_clip": 1.00174594, + "balance_loss_mlp": 1.00066257, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 1.7393160572042963, + "language_loss": 0.74723977, + "learning_rate": 2.696448045740828e-06, + "loss": 0.76970232, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 4.008694887161255 + }, + { + "auxiliary_loss_clip": 0.01124007, + "auxiliary_loss_mlp": 0.01126133, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00072443, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.721808202181651, + "language_loss": 0.73989177, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76239312, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 2.6795496940612793 + }, + { + "auxiliary_loss_clip": 0.01155216, + "auxiliary_loss_mlp": 0.01125291, + "balance_loss_clip": 1.00210059, + "balance_loss_mlp": 1.00064588, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.68236124258816, + "language_loss": 0.77344388, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79624891, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.5409672260284424 + }, + { + "auxiliary_loss_clip": 0.01170277, + "auxiliary_loss_mlp": 0.01125862, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00074053, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 2.4098844884732666, + "language_loss": 0.71304435, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73600572, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.5037996768951416 + }, + { + "auxiliary_loss_clip": 0.01170481, + "auxiliary_loss_mlp": 0.01125789, + "balance_loss_clip": 1.00206137, + "balance_loss_mlp": 1.00066674, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 1.980863787695146, + "language_loss": 0.72539842, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74836111, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.4371399879455566 + }, + { + "auxiliary_loss_clip": 0.01139956, + "auxiliary_loss_mlp": 0.01126831, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00075531, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 2.7033433174946255, + "language_loss": 0.70613521, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72880304, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 2.572016716003418 + }, + { + "auxiliary_loss_clip": 0.01153502, + "auxiliary_loss_mlp": 0.01125114, + "balance_loss_clip": 1.0018779, + "balance_loss_mlp": 1.0006597, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.573054333467917, + "language_loss": 0.80158305, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82436919, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.580915689468384 + }, + { + "auxiliary_loss_clip": 0.01144331, + "auxiliary_loss_mlp": 0.01125923, + "balance_loss_clip": 1.00256228, + "balance_loss_mlp": 1.00089622, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.9513258706702803, + "language_loss": 0.67136621, + "learning_rate": 2.693891798911731e-06, + "loss": 0.69406867, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.581552028656006 + }, + { + "auxiliary_loss_clip": 0.011221, + "auxiliary_loss_mlp": 0.01126236, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00082827, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.359013184561542, + "language_loss": 0.5698508, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59233409, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.809199810028076 + }, + { + "auxiliary_loss_clip": 0.01126463, + "auxiliary_loss_mlp": 0.01126231, + "balance_loss_clip": 1.00237906, + "balance_loss_mlp": 1.00082326, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.693985510618304, + "language_loss": 0.84472889, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86725581, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.6933505535125732 + }, + { + "auxiliary_loss_clip": 0.01137491, + "auxiliary_loss_mlp": 0.01126351, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00084782, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 5.716258541352656, + "language_loss": 0.81397378, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83661222, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.5557663440704346 + }, + { + "auxiliary_loss_clip": 0.01155255, + "auxiliary_loss_mlp": 0.00748489, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00166059, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.5374105270029985, + "language_loss": 0.75294143, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77197886, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.578650951385498 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01126207, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00070369, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.065883342297255, + "language_loss": 0.7394281, + "learning_rate": 2.692065118669195e-06, + "loss": 0.7620753, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.5716490745544434 + }, + { + "auxiliary_loss_clip": 0.01112148, + "auxiliary_loss_mlp": 0.01126179, + "balance_loss_clip": 1.00224721, + "balance_loss_mlp": 1.00067592, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.603518593958257, + "language_loss": 0.66859138, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.69097471, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 2.711775779724121 + }, + { + "auxiliary_loss_clip": 0.01105686, + "auxiliary_loss_mlp": 0.01126867, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00098252, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.631748275548204, + "language_loss": 0.70614541, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72847092, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 2.945535182952881 + }, + { + "auxiliary_loss_clip": 0.01140196, + "auxiliary_loss_mlp": 0.01126722, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00083685, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 1.7389410904457936, + "language_loss": 0.71856976, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74123889, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.55830979347229 + }, + { + "auxiliary_loss_clip": 0.0112776, + "auxiliary_loss_mlp": 0.0112663, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00084078, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7053586855428449, + "language_loss": 0.82844889, + "learning_rate": 2.690603302014844e-06, + "loss": 0.8509928, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.6186063289642334 + }, + { + "auxiliary_loss_clip": 0.01112205, + "auxiliary_loss_mlp": 0.01126129, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00062513, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.7189951504183671, + "language_loss": 0.70719099, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72957432, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.721240758895874 + }, + { + "auxiliary_loss_clip": 0.01093005, + "auxiliary_loss_mlp": 0.00748537, + "balance_loss_clip": 1.00182045, + "balance_loss_mlp": 1.00171947, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.6393517614521924, + "language_loss": 0.78811789, + "learning_rate": 2.689872236505755e-06, + "loss": 0.80653334, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 2.740825653076172 + }, + { + "auxiliary_loss_clip": 0.01136968, + "auxiliary_loss_mlp": 0.01126122, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.0006187, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.6096260148655885, + "language_loss": 0.78577834, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80840927, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.590587854385376 + }, + { + "auxiliary_loss_clip": 0.01120553, + "auxiliary_loss_mlp": 0.01126183, + "balance_loss_clip": 1.0019784, + "balance_loss_mlp": 1.0007745, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.0863765505320058, + "language_loss": 0.88837898, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.91084635, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.5924158096313477 + }, + { + "auxiliary_loss_clip": 0.01126219, + "auxiliary_loss_mlp": 0.01125311, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.00066578, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 1.6585223691325863, + "language_loss": 0.64121068, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66372597, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.01155643, + "auxiliary_loss_mlp": 0.01126605, + "balance_loss_clip": 1.00209808, + "balance_loss_mlp": 1.00072002, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.3811287180578722, + "language_loss": 0.75180197, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77462447, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.6185598373413086 + }, + { + "auxiliary_loss_clip": 0.01137373, + "auxiliary_loss_mlp": 0.01125351, + "balance_loss_clip": 1.00221419, + "balance_loss_mlp": 1.00070548, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4628170336267279, + "language_loss": 0.701612, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72423923, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 2.587071657180786 + }, + { + "auxiliary_loss_clip": 0.01154176, + "auxiliary_loss_mlp": 0.01126579, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00098002, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 3.0997727467358995, + "language_loss": 0.73289937, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75570691, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.578766345977783 + }, + { + "auxiliary_loss_clip": 0.01125299, + "auxiliary_loss_mlp": 0.01126475, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00068593, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 2.1105932794432203, + "language_loss": 0.69124401, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71376175, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.5634520053863525 + }, + { + "auxiliary_loss_clip": 0.01124332, + "auxiliary_loss_mlp": 0.01127696, + "balance_loss_clip": 1.00193489, + "balance_loss_mlp": 1.00095248, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.1668743910417274, + "language_loss": 0.90908587, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93160617, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.6674320697784424 + }, + { + "auxiliary_loss_clip": 0.01155577, + "auxiliary_loss_mlp": 0.01127025, + "balance_loss_clip": 1.00218558, + "balance_loss_mlp": 1.00075889, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 3.2564361834320654, + "language_loss": 0.79081917, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.81364524, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 4.004404067993164 + }, + { + "auxiliary_loss_clip": 0.01170475, + "auxiliary_loss_mlp": 0.01126285, + "balance_loss_clip": 1.00194001, + "balance_loss_mlp": 1.00087631, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 2.73865555823268, + "language_loss": 0.76349139, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78645897, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.479325771331787 + }, + { + "auxiliary_loss_clip": 0.01154043, + "auxiliary_loss_mlp": 0.01126139, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00073051, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.6736578964304365, + "language_loss": 0.77291602, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79571784, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.599891424179077 + }, + { + "auxiliary_loss_clip": 0.01170539, + "auxiliary_loss_mlp": 0.01125708, + "balance_loss_clip": 1.00211239, + "balance_loss_mlp": 1.00068116, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.0038162775430144, + "language_loss": 0.87149382, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89445627, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.5038576126098633 + }, + { + "auxiliary_loss_clip": 0.01138532, + "auxiliary_loss_mlp": 0.01126043, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00092125, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.9622137690066412, + "language_loss": 0.81130487, + "learning_rate": 2.685117765051156e-06, + "loss": 0.8339507, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.63934588432312 + }, + { + "auxiliary_loss_clip": 0.01170623, + "auxiliary_loss_mlp": 0.01126557, + "balance_loss_clip": 1.00212932, + "balance_loss_mlp": 1.00086319, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.7153075429661813, + "language_loss": 0.80016136, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82313311, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 3.965642213821411 + }, + { + "auxiliary_loss_clip": 0.01123052, + "auxiliary_loss_mlp": 0.01125544, + "balance_loss_clip": 1.00191283, + "balance_loss_mlp": 1.00080323, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.375055831773422, + "language_loss": 0.76225674, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78474271, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.67374587059021 + }, + { + "auxiliary_loss_clip": 0.01136959, + "auxiliary_loss_mlp": 0.01126357, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00085378, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.8481993910374594, + "language_loss": 0.81475377, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83738685, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.5828826427459717 + }, + { + "auxiliary_loss_clip": 0.01136989, + "auxiliary_loss_mlp": 0.01107626, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00005138, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8316219082777598, + "language_loss": 0.64342141, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66586757, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.0649986267089844 + }, + { + "auxiliary_loss_clip": 0.01104646, + "auxiliary_loss_mlp": 0.01126711, + "balance_loss_clip": 1.00191057, + "balance_loss_mlp": 1.00073087, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 1.7512257107977123, + "language_loss": 0.72660136, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74891496, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.7139346599578857 + }, + { + "auxiliary_loss_clip": 0.01143488, + "auxiliary_loss_mlp": 0.00748527, + "balance_loss_clip": 1.00240076, + "balance_loss_mlp": 1.00170135, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.3881462482516052, + "language_loss": 0.77903521, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.79795539, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 4.021945476531982 + }, + { + "auxiliary_loss_clip": 0.01153967, + "auxiliary_loss_mlp": 0.01127396, + "balance_loss_clip": 1.00213313, + "balance_loss_mlp": 1.00084376, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.349284979381787, + "language_loss": 0.79219961, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81501329, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 4.024527072906494 + }, + { + "auxiliary_loss_clip": 0.01166961, + "auxiliary_loss_mlp": 0.01108434, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.0000968, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6812569689579668, + "language_loss": 0.53091079, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55366474, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.095707893371582 + }, + { + "auxiliary_loss_clip": 0.01170681, + "auxiliary_loss_mlp": 0.00748374, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.00175583, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 1.9043722569594248, + "language_loss": 0.82494915, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84413964, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.508915901184082 + }, + { + "auxiliary_loss_clip": 0.01155586, + "auxiliary_loss_mlp": 0.01126522, + "balance_loss_clip": 1.0022285, + "balance_loss_mlp": 1.00082779, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.494527835170042, + "language_loss": 0.76441461, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78723568, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.602569341659546 + }, + { + "auxiliary_loss_clip": 0.01153668, + "auxiliary_loss_mlp": 0.01125816, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.00088441, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 1.8413975968495822, + "language_loss": 0.66196221, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68475705, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.5131819248199463 + }, + { + "auxiliary_loss_clip": 0.01138815, + "auxiliary_loss_mlp": 0.0112565, + "balance_loss_clip": 1.00200117, + "balance_loss_mlp": 1.00062299, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.5532292898632156, + "language_loss": 0.71178836, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.734433, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.7200989723205566 + }, + { + "auxiliary_loss_clip": 0.01155378, + "auxiliary_loss_mlp": 0.01126524, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.00054348, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.8056781370562125, + "language_loss": 0.8210547, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84387374, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.52449893951416 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01126773, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.00069737, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.5507307087311286, + "language_loss": 0.80875731, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83156645, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.6866049766540527 + }, + { + "auxiliary_loss_clip": 0.01122318, + "auxiliary_loss_mlp": 0.01127473, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.00073016, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.853027448249101, + "language_loss": 0.65701032, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67950821, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.656216859817505 + }, + { + "auxiliary_loss_clip": 0.01154072, + "auxiliary_loss_mlp": 0.01126226, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00062692, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.302778508276566, + "language_loss": 0.79751015, + "learning_rate": 2.679260083800989e-06, + "loss": 0.8203131, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.522289514541626 + }, + { + "auxiliary_loss_clip": 0.011706, + "auxiliary_loss_mlp": 0.0112611, + "balance_loss_clip": 1.00216174, + "balance_loss_mlp": 1.00098801, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.581708667360442, + "language_loss": 0.8179791, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84094626, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.4936704635620117 + }, + { + "auxiliary_loss_clip": 0.01153354, + "auxiliary_loss_mlp": 0.01126241, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.0006417, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.8764714686429, + "language_loss": 0.67824608, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70104206, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.528238296508789 + }, + { + "auxiliary_loss_clip": 0.01138443, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00075305, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 1.6708777976694995, + "language_loss": 0.66282409, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68546438, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.741117000579834 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01125837, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00071502, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.8389207772499865, + "language_loss": 0.60265398, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62498254, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 2.649998188018799 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01126605, + "balance_loss_clip": 1.00222588, + "balance_loss_mlp": 1.00091076, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 2.7167159955131375, + "language_loss": 0.69602752, + "learning_rate": 2.677428203462683e-06, + "loss": 0.7188499, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.5025904178619385 + }, + { + "auxiliary_loss_clip": 0.011502, + "auxiliary_loss_mlp": 0.01107725, + "balance_loss_clip": 1.00141788, + "balance_loss_mlp": 1.00015056, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7405276868581538, + "language_loss": 0.59622717, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61880636, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.1645100116729736 + }, + { + "auxiliary_loss_clip": 0.01170604, + "auxiliary_loss_mlp": 0.01126798, + "balance_loss_clip": 1.00221825, + "balance_loss_mlp": 1.00081778, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 1.5694600429767782, + "language_loss": 0.80404073, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82701474, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.4987597465515137 + }, + { + "auxiliary_loss_clip": 0.01154172, + "auxiliary_loss_mlp": 0.01126791, + "balance_loss_clip": 1.0020957, + "balance_loss_mlp": 1.0008105, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.73162201697178, + "language_loss": 0.84892899, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87173861, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 2.5884110927581787 + }, + { + "auxiliary_loss_clip": 0.0112195, + "auxiliary_loss_mlp": 0.01126601, + "balance_loss_clip": 1.00195277, + "balance_loss_mlp": 1.00081182, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.6266133603447972, + "language_loss": 0.80161047, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82409596, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.63004207611084 + }, + { + "auxiliary_loss_clip": 0.01154205, + "auxiliary_loss_mlp": 0.01127094, + "balance_loss_clip": 1.00200772, + "balance_loss_mlp": 1.00073218, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 2.330735860825478, + "language_loss": 0.70074242, + "learning_rate": 2.675595680920792e-06, + "loss": 0.72355539, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 2.500491142272949 + }, + { + "auxiliary_loss_clip": 0.01155258, + "auxiliary_loss_mlp": 0.00748541, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00168288, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.678379243947487, + "language_loss": 0.77857262, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.79761064, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 2.5651299953460693 + }, + { + "auxiliary_loss_clip": 0.01155348, + "auxiliary_loss_mlp": 0.01126661, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.0010618, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.6919983874793858, + "language_loss": 0.85459125, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87741137, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.490328550338745 + }, + { + "auxiliary_loss_clip": 0.01170661, + "auxiliary_loss_mlp": 0.01125272, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.0008173, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.7616334002200305, + "language_loss": 0.84456402, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86752331, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 2.5245113372802734 + }, + { + "auxiliary_loss_clip": 0.01123714, + "auxiliary_loss_mlp": 0.01126789, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00080848, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.118164441382936, + "language_loss": 0.83299911, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85550416, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.601442575454712 + }, + { + "auxiliary_loss_clip": 0.01153725, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_clip": 1.00205743, + "balance_loss_mlp": 1.00097716, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.7608255082081823, + "language_loss": 0.74751484, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.77031785, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.5289738178253174 + }, + { + "auxiliary_loss_clip": 0.01154985, + "auxiliary_loss_mlp": 0.01126157, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.0007484, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 2.3684247612790807, + "language_loss": 0.79864907, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82146049, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.4953160285949707 + }, + { + "auxiliary_loss_clip": 0.01154453, + "auxiliary_loss_mlp": 0.01127325, + "balance_loss_clip": 1.00228119, + "balance_loss_mlp": 1.00096321, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 2.4465521664863696, + "language_loss": 0.7603004, + "learning_rate": 2.673029073767934e-06, + "loss": 0.78311825, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.5329861640930176 + }, + { + "auxiliary_loss_clip": 0.010897, + "auxiliary_loss_mlp": 0.00748487, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00156283, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 2.1564508669816314, + "language_loss": 0.78664732, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80502915, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 4.175090551376343 + }, + { + "auxiliary_loss_clip": 0.01170623, + "auxiliary_loss_mlp": 0.01126797, + "balance_loss_clip": 1.00210953, + "balance_loss_mlp": 1.00091219, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.7389215131556115, + "language_loss": 0.74940342, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77237767, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.542407989501953 + }, + { + "auxiliary_loss_clip": 0.01106523, + "auxiliary_loss_mlp": 0.01126654, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00096011, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.546826822705444, + "language_loss": 0.79322135, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81555313, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 2.6835477352142334 + }, + { + "auxiliary_loss_clip": 0.01154141, + "auxiliary_loss_mlp": 0.01126356, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.00056624, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 2.165875600022275, + "language_loss": 0.72048509, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74329001, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.570344924926758 + }, + { + "auxiliary_loss_clip": 0.0114028, + "auxiliary_loss_mlp": 0.0110839, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00005245, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8189071261061602, + "language_loss": 0.58788192, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.61036861, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.285855770111084 + }, + { + "auxiliary_loss_clip": 0.01139784, + "auxiliary_loss_mlp": 0.01126135, + "balance_loss_clip": 1.00210786, + "balance_loss_mlp": 1.00082254, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.4952463426399394, + "language_loss": 0.54662442, + "learning_rate": 2.670828129267242e-06, + "loss": 0.5692836, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 4.064679145812988 + }, + { + "auxiliary_loss_clip": 0.01140306, + "auxiliary_loss_mlp": 0.01126423, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00063372, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.7594440679809287, + "language_loss": 0.83270508, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85537231, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.59312105178833 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01127391, + "balance_loss_clip": 1.00222504, + "balance_loss_mlp": 1.00102925, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.0096083453807405, + "language_loss": 0.77101481, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79367554, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.592597246170044 + }, + { + "auxiliary_loss_clip": 0.01170651, + "auxiliary_loss_mlp": 0.01126428, + "balance_loss_clip": 1.00223446, + "balance_loss_mlp": 1.00073409, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.5150172489661013, + "language_loss": 0.7019031, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72487396, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.4804539680480957 + }, + { + "auxiliary_loss_clip": 0.01170607, + "auxiliary_loss_mlp": 0.01126554, + "balance_loss_clip": 1.00213265, + "balance_loss_mlp": 1.00085986, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 6.40059781383533, + "language_loss": 0.66262054, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68559217, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 3.971726417541504 + }, + { + "auxiliary_loss_clip": 0.01155614, + "auxiliary_loss_mlp": 0.00748453, + "balance_loss_clip": 1.00212276, + "balance_loss_mlp": 1.00151348, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.713192270246736, + "language_loss": 0.7432735, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.7623142, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.6065428256988525 + }, + { + "auxiliary_loss_clip": 0.01107653, + "auxiliary_loss_mlp": 0.01126905, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.0006386, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 5.168020520868681, + "language_loss": 0.65855092, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68089652, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 4.106940507888794 + }, + { + "auxiliary_loss_clip": 0.01153835, + "auxiliary_loss_mlp": 0.01126065, + "balance_loss_clip": 1.00216031, + "balance_loss_mlp": 1.00094295, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.630312694917892, + "language_loss": 0.76445675, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78725582, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.5561280250549316 + }, + { + "auxiliary_loss_clip": 0.01139999, + "auxiliary_loss_mlp": 0.01126372, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00105906, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 1.930934637357393, + "language_loss": 0.81887698, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.84154069, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.53135347366333 + }, + { + "auxiliary_loss_clip": 0.01139203, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00074577, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 4.122753306842292, + "language_loss": 0.80444849, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82710683, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.620246648788452 + }, + { + "auxiliary_loss_clip": 0.01124649, + "auxiliary_loss_mlp": 0.01125603, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00067115, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.497116502517458, + "language_loss": 0.66171092, + "learning_rate": 2.66715785488769e-06, + "loss": 0.6842134, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.6792192459106445 + }, + { + "auxiliary_loss_clip": 0.01138965, + "auxiliary_loss_mlp": 0.01127563, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00091529, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 2.4848752185058482, + "language_loss": 0.84845567, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87112093, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.616499185562134 + }, + { + "auxiliary_loss_clip": 0.01154196, + "auxiliary_loss_mlp": 0.01126048, + "balance_loss_clip": 1.00216413, + "balance_loss_mlp": 1.00092626, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.6232001979679889, + "language_loss": 0.71191102, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73471349, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.584029197692871 + }, + { + "auxiliary_loss_clip": 0.01154069, + "auxiliary_loss_mlp": 0.0112571, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00087357, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.7705923039929001, + "language_loss": 0.74747312, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.77027082, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.533979654312134 + }, + { + "auxiliary_loss_clip": 0.0113831, + "auxiliary_loss_mlp": 0.01126118, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00080514, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.943896954521529, + "language_loss": 0.75732005, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77996433, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.567631483078003 + }, + { + "auxiliary_loss_clip": 0.01103685, + "auxiliary_loss_mlp": 0.01127753, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00100982, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 2.2837222399791286, + "language_loss": 0.73085207, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75316644, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.687988042831421 + }, + { + "auxiliary_loss_clip": 0.01122104, + "auxiliary_loss_mlp": 0.01126771, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00079072, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 3.1910960207034127, + "language_loss": 0.71899909, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.74148786, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.6485307216644287 + }, + { + "auxiliary_loss_clip": 0.01123988, + "auxiliary_loss_mlp": 0.01126153, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00093555, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9536333266365498, + "language_loss": 0.85383654, + "learning_rate": 2.664587156721768e-06, + "loss": 0.87633801, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.6443800926208496 + }, + { + "auxiliary_loss_clip": 0.01136823, + "auxiliary_loss_mlp": 0.00748456, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00156689, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.6488279823560699, + "language_loss": 0.6635958, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68244857, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.6130213737487793 + }, + { + "auxiliary_loss_clip": 0.0113864, + "auxiliary_loss_mlp": 0.01125984, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00076628, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3379414133221572, + "language_loss": 0.72024512, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74289143, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.5881056785583496 + }, + { + "auxiliary_loss_clip": 0.01139178, + "auxiliary_loss_mlp": 0.01127363, + "balance_loss_clip": 1.00224876, + "balance_loss_mlp": 1.00090611, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 2.501410736564675, + "language_loss": 0.83465934, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85732478, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.5890581607818604 + }, + { + "auxiliary_loss_clip": 0.01153894, + "auxiliary_loss_mlp": 0.01125086, + "balance_loss_clip": 1.00208318, + "balance_loss_mlp": 1.00082183, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.5635360070166906, + "language_loss": 0.89807248, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92086232, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.565192937850952 + }, + { + "auxiliary_loss_clip": 0.01122213, + "auxiliary_loss_mlp": 0.01125878, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.00075603, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 1.957945733784211, + "language_loss": 0.65065753, + "learning_rate": 2.662750187431268e-06, + "loss": 0.6731385, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 2.663174867630005 + }, + { + "auxiliary_loss_clip": 0.011706, + "auxiliary_loss_mlp": 0.01126159, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.00065577, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 2.735841174886516, + "language_loss": 0.69450593, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71747351, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 2.568347930908203 + }, + { + "auxiliary_loss_clip": 0.01109124, + "auxiliary_loss_mlp": 0.01125546, + "balance_loss_clip": 1.002352, + "balance_loss_mlp": 1.00071001, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 1.8535812416905668, + "language_loss": 0.73298705, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75533378, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.656043291091919 + }, + { + "auxiliary_loss_clip": 0.01107006, + "auxiliary_loss_mlp": 0.01127007, + "balance_loss_clip": 1.00210786, + "balance_loss_mlp": 1.000741, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.8928210234503133, + "language_loss": 0.72430742, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.7466476, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.691772699356079 + }, + { + "auxiliary_loss_clip": 0.01153877, + "auxiliary_loss_mlp": 0.01126002, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00078487, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 3.4207242369478754, + "language_loss": 0.71573269, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73853153, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 2.5720129013061523 + }, + { + "auxiliary_loss_clip": 0.01155416, + "auxiliary_loss_mlp": 0.01126228, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00091505, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.7318125182872122, + "language_loss": 0.87004095, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89285737, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.5372138023376465 + }, + { + "auxiliary_loss_clip": 0.01153192, + "auxiliary_loss_mlp": 0.01125271, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00081599, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.8101714047932724, + "language_loss": 0.69069785, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.7134825, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 2.5378105640411377 + }, + { + "auxiliary_loss_clip": 0.01170486, + "auxiliary_loss_mlp": 0.0112624, + "balance_loss_clip": 1.00208414, + "balance_loss_mlp": 1.00073647, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.6269700530611704, + "language_loss": 0.75059509, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77356243, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 2.507718086242676 + }, + { + "auxiliary_loss_clip": 0.01121744, + "auxiliary_loss_mlp": 0.01126483, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00069284, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 2.293550832592217, + "language_loss": 0.82205307, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84453535, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.6159276962280273 + }, + { + "auxiliary_loss_clip": 0.01170488, + "auxiliary_loss_mlp": 0.01125622, + "balance_loss_clip": 1.00211334, + "balance_loss_mlp": 1.00069082, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 2.015265629144253, + "language_loss": 0.80328852, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.8262496, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.5185184478759766 + }, + { + "auxiliary_loss_clip": 0.01153098, + "auxiliary_loss_mlp": 0.01125026, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00076234, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 2.9983463502307184, + "language_loss": 0.67603767, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.69881892, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 3.9783220291137695 + }, + { + "auxiliary_loss_clip": 0.01150711, + "auxiliary_loss_mlp": 0.01107699, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00012457, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.872315120401501, + "language_loss": 0.59669948, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61928356, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.208794593811035 + }, + { + "auxiliary_loss_clip": 0.01153938, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_clip": 1.00200665, + "balance_loss_mlp": 1.00058174, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 1.9904466985506157, + "language_loss": 0.6972369, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72002947, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.5140719413757324 + }, + { + "auxiliary_loss_clip": 0.01117875, + "auxiliary_loss_mlp": 0.0110801, + "balance_loss_clip": 1.00162506, + "balance_loss_mlp": 1.00043571, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7158599430230548, + "language_loss": 0.5363915, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55865026, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.1830742359161377 + }, + { + "auxiliary_loss_clip": 0.01153882, + "auxiliary_loss_mlp": 0.01126006, + "balance_loss_clip": 1.0021677, + "balance_loss_mlp": 1.00078881, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.7475591554504497, + "language_loss": 0.66408187, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68688077, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.5229203701019287 + }, + { + "auxiliary_loss_clip": 0.01170544, + "auxiliary_loss_mlp": 0.01125575, + "balance_loss_clip": 1.0022521, + "balance_loss_mlp": 1.00064337, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.9332747917086366, + "language_loss": 0.7013377, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72429883, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 2.5013060569763184 + }, + { + "auxiliary_loss_clip": 0.01143152, + "auxiliary_loss_mlp": 0.01126008, + "balance_loss_clip": 1.00251651, + "balance_loss_mlp": 1.00069499, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.4775390220342868, + "language_loss": 0.64835238, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67104399, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 4.058349609375 + }, + { + "auxiliary_loss_clip": 0.01142742, + "auxiliary_loss_mlp": 0.01125017, + "balance_loss_clip": 1.00230241, + "balance_loss_mlp": 1.00094414, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.3719217059833997, + "language_loss": 0.70155847, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72423607, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.6971466541290283 + }, + { + "auxiliary_loss_clip": 0.01135908, + "auxiliary_loss_mlp": 0.00746792, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00016582, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8833885743522694, + "language_loss": 0.5627352, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58156222, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.2358288764953613 + }, + { + "auxiliary_loss_clip": 0.01139664, + "auxiliary_loss_mlp": 0.01126315, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00071645, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.5430326165507466, + "language_loss": 0.76343232, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78609216, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.01104813, + "auxiliary_loss_mlp": 0.01125445, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00079942, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.6343510435734698, + "language_loss": 0.67644399, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.69874662, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 4.224512577056885 + }, + { + "auxiliary_loss_clip": 0.01120568, + "auxiliary_loss_mlp": 0.0112639, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00088692, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.5170847863497956, + "language_loss": 0.79725301, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81972253, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 3.997421979904175 + }, + { + "auxiliary_loss_clip": 0.01170714, + "auxiliary_loss_mlp": 0.01126151, + "balance_loss_clip": 1.00222921, + "balance_loss_mlp": 1.00064754, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 1.8896256188380542, + "language_loss": 0.77368128, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79664993, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.5983331203460693 + }, + { + "auxiliary_loss_clip": 0.01155658, + "auxiliary_loss_mlp": 0.01127433, + "balance_loss_clip": 1.00214505, + "balance_loss_mlp": 1.00078535, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.6955545459303942, + "language_loss": 0.65797436, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68080527, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.713818311691284 + }, + { + "auxiliary_loss_clip": 0.01138591, + "auxiliary_loss_mlp": 0.01125742, + "balance_loss_clip": 1.00214696, + "balance_loss_mlp": 1.00071514, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.7546672270890296, + "language_loss": 0.83490223, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85754555, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.6079325675964355 + }, + { + "auxiliary_loss_clip": 0.01153932, + "auxiliary_loss_mlp": 0.01125428, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00087857, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.7832000779402857, + "language_loss": 0.78985125, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81264484, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.5570104122161865 + }, + { + "auxiliary_loss_clip": 0.01122357, + "auxiliary_loss_mlp": 0.01125922, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00079989, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 4.838981207371734, + "language_loss": 0.79415858, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81664139, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.598346471786499 + }, + { + "auxiliary_loss_clip": 0.01153598, + "auxiliary_loss_mlp": 0.0074853, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00156343, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.6369039450687537, + "language_loss": 0.70453048, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72355175, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.5449490547180176 + }, + { + "auxiliary_loss_clip": 0.01153849, + "auxiliary_loss_mlp": 0.01125789, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00085795, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4420344982298983, + "language_loss": 0.59159994, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61439633, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.76583194732666 + }, + { + "auxiliary_loss_clip": 0.01170538, + "auxiliary_loss_mlp": 0.01125436, + "balance_loss_clip": 1.00217712, + "balance_loss_mlp": 1.00079095, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.1475723411916996, + "language_loss": 0.73468029, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75764, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.4977526664733887 + }, + { + "auxiliary_loss_clip": 0.01077085, + "auxiliary_loss_mlp": 0.01125385, + "balance_loss_clip": 1.00168133, + "balance_loss_mlp": 1.00064445, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.5949515966876773, + "language_loss": 0.74477845, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76680315, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.7197179794311523 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.01125514, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00067782, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 1.8669765890515742, + "language_loss": 0.79458386, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81724, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.5570597648620605 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.01125811, + "balance_loss_clip": 1.00217187, + "balance_loss_mlp": 1.00078404, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.6432823678509547, + "language_loss": 0.75977516, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78243774, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.59832763671875 + }, + { + "auxiliary_loss_clip": 0.01152103, + "auxiliary_loss_mlp": 0.01108344, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00000632, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7057151264967789, + "language_loss": 0.52805966, + "learning_rate": 2.650610514218691e-06, + "loss": 0.55066413, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.0953242778778076 + }, + { + "auxiliary_loss_clip": 0.01170686, + "auxiliary_loss_mlp": 0.01126159, + "balance_loss_clip": 1.00216198, + "balance_loss_mlp": 1.00075102, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.9515394352972568, + "language_loss": 0.72842139, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.75138986, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.5364508628845215 + }, + { + "auxiliary_loss_clip": 0.01167216, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00010097, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9152404813620574, + "language_loss": 0.66582215, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68857861, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 2.9811689853668213 + }, + { + "auxiliary_loss_clip": 0.01170438, + "auxiliary_loss_mlp": 0.01125164, + "balance_loss_clip": 1.00209689, + "balance_loss_mlp": 1.00070953, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.668929181509956, + "language_loss": 0.81002629, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8329823, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.4715216159820557 + }, + { + "auxiliary_loss_clip": 0.01137078, + "auxiliary_loss_mlp": 0.01125731, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.00070465, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 1.8848930670940653, + "language_loss": 0.77215272, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79478085, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.6707870960235596 + }, + { + "auxiliary_loss_clip": 0.01150525, + "auxiliary_loss_mlp": 0.0110872, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.0003823, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8670708337491455, + "language_loss": 0.57835209, + "learning_rate": 2.64876881365164e-06, + "loss": 0.60094452, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.876234292984009 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_clip": 1.00214446, + "balance_loss_mlp": 1.00071406, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.973220002697913, + "language_loss": 0.75658393, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77937567, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.614642381668091 + }, + { + "auxiliary_loss_clip": 0.01120492, + "auxiliary_loss_mlp": 0.01125251, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00079632, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.669173216431443, + "language_loss": 0.83241117, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85486865, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 2.6279962062835693 + }, + { + "auxiliary_loss_clip": 0.01121708, + "auxiliary_loss_mlp": 0.01126232, + "balance_loss_clip": 1.00212538, + "balance_loss_mlp": 1.00091934, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 2.3698215700665393, + "language_loss": 0.68460238, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70708179, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.658757209777832 + }, + { + "auxiliary_loss_clip": 0.01139279, + "auxiliary_loss_mlp": 0.01125319, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.00076914, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.8834527426141259, + "language_loss": 0.75672996, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.77937591, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.5794012546539307 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01126211, + "balance_loss_clip": 1.00220108, + "balance_loss_mlp": 1.00070739, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.7580234800791137, + "language_loss": 0.83257788, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85522801, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.60874080657959 + }, + { + "auxiliary_loss_clip": 0.01123392, + "auxiliary_loss_mlp": 0.01125831, + "balance_loss_clip": 1.00174963, + "balance_loss_mlp": 1.00061405, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 1.6861627733435933, + "language_loss": 0.7169649, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73945713, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 2.5915400981903076 + }, + { + "auxiliary_loss_clip": 0.01140355, + "auxiliary_loss_mlp": 0.0112526, + "balance_loss_clip": 1.00212872, + "balance_loss_mlp": 1.00099671, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.470707916280528, + "language_loss": 0.82750118, + "learning_rate": 2.646189399991154e-06, + "loss": 0.85015726, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 2.6170332431793213 + }, + { + "auxiliary_loss_clip": 0.0115396, + "auxiliary_loss_mlp": 0.01126334, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00073504, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.4047388826696676, + "language_loss": 0.65306985, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.6758728, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 2.511895179748535 + }, + { + "auxiliary_loss_clip": 0.01153562, + "auxiliary_loss_mlp": 0.01125823, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00079632, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.9620307417733431, + "language_loss": 0.76252759, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78532147, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 2.591470956802368 + }, + { + "auxiliary_loss_clip": 0.01153586, + "auxiliary_loss_mlp": 0.00748461, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00162363, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.847075899892968, + "language_loss": 0.801889, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.8209095, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.579812526702881 + }, + { + "auxiliary_loss_clip": 0.01170534, + "auxiliary_loss_mlp": 0.01126241, + "balance_loss_clip": 1.00223875, + "balance_loss_mlp": 1.00083244, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.841277050632883, + "language_loss": 0.84878045, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.87174821, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 4.083536148071289 + }, + { + "auxiliary_loss_clip": 0.0113747, + "auxiliary_loss_mlp": 0.0112602, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00061202, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.7211387052940648, + "language_loss": 0.70414269, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72677761, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.601979970932007 + }, + { + "auxiliary_loss_clip": 0.01170391, + "auxiliary_loss_mlp": 0.01125909, + "balance_loss_clip": 1.0021925, + "balance_loss_mlp": 1.00107348, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.868205033308398, + "language_loss": 0.81125748, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83422041, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.5239953994750977 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.01126751, + "balance_loss_clip": 1.00211024, + "balance_loss_mlp": 1.00105667, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 1.961257016722182, + "language_loss": 0.69800162, + "learning_rate": 2.643608785656077e-06, + "loss": 0.72065508, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 2.5862178802490234 + }, + { + "auxiliary_loss_clip": 0.01155322, + "auxiliary_loss_mlp": 0.0112542, + "balance_loss_clip": 1.00219011, + "balance_loss_mlp": 1.00077486, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.7729854337757918, + "language_loss": 0.75646263, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77927005, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.5619568824768066 + }, + { + "auxiliary_loss_clip": 0.01127734, + "auxiliary_loss_mlp": 0.01125329, + "balance_loss_clip": 1.00276935, + "balance_loss_mlp": 1.00097013, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.359256278656846, + "language_loss": 0.75556552, + "learning_rate": 2.642871247413523e-06, + "loss": 0.7780962, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 4.07699728012085 + }, + { + "auxiliary_loss_clip": 0.01170455, + "auxiliary_loss_mlp": 0.01126128, + "balance_loss_clip": 1.00211012, + "balance_loss_mlp": 1.00081468, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 2.3029385143283116, + "language_loss": 0.70435703, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.72732282, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.543639898300171 + }, + { + "auxiliary_loss_clip": 0.01170503, + "auxiliary_loss_mlp": 0.0074854, + "balance_loss_clip": 1.00218451, + "balance_loss_mlp": 1.00174093, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.5679616809821098, + "language_loss": 0.7554301, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77462053, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.5121617317199707 + }, + { + "auxiliary_loss_clip": 0.01153664, + "auxiliary_loss_mlp": 0.0112466, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00058675, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 2.079489804625668, + "language_loss": 0.70202327, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72480649, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.634671449661255 + }, + { + "auxiliary_loss_clip": 0.01170365, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_clip": 1.00212085, + "balance_loss_mlp": 1.00063753, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 1.6976096332621229, + "language_loss": 0.76076567, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78371644, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 3.976658582687378 + }, + { + "auxiliary_loss_clip": 0.01121796, + "auxiliary_loss_mlp": 0.00748412, + "balance_loss_clip": 1.00215554, + "balance_loss_mlp": 1.00170207, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.6423098798222682, + "language_loss": 0.80236286, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82106501, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.694704294204712 + }, + { + "auxiliary_loss_clip": 0.01170302, + "auxiliary_loss_mlp": 0.01125616, + "balance_loss_clip": 1.00214958, + "balance_loss_mlp": 1.00087523, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.8277629015389283, + "language_loss": 0.74390697, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76686615, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 3.8772733211517334 + }, + { + "auxiliary_loss_clip": 0.01108005, + "auxiliary_loss_mlp": 0.01125855, + "balance_loss_clip": 1.00184131, + "balance_loss_mlp": 1.00082803, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.6550818108650285, + "language_loss": 0.84076136, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86309993, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.6572999954223633 + }, + { + "auxiliary_loss_clip": 0.01123434, + "auxiliary_loss_mlp": 0.00748531, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00173903, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.5745176383992976, + "language_loss": 0.70258427, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72130394, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.765765428543091 + }, + { + "auxiliary_loss_clip": 0.01170405, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_clip": 1.00219238, + "balance_loss_mlp": 1.00065231, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.8612396691065214, + "language_loss": 0.72828025, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75124013, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.5682899951934814 + }, + { + "auxiliary_loss_clip": 0.01155286, + "auxiliary_loss_mlp": 0.01125243, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.00059724, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 4.94187608524638, + "language_loss": 0.62514246, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64794773, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.532827854156494 + }, + { + "auxiliary_loss_clip": 0.01121724, + "auxiliary_loss_mlp": 0.01125281, + "balance_loss_clip": 1.00201046, + "balance_loss_mlp": 1.0008266, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.663228701949392, + "language_loss": 0.70616305, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72863317, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.6916263103485107 + }, + { + "auxiliary_loss_clip": 0.01170359, + "auxiliary_loss_mlp": 0.01126129, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00110197, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.7714624384868771, + "language_loss": 0.73175484, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75471967, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.6236236095428467 + }, + { + "auxiliary_loss_clip": 0.01153699, + "auxiliary_loss_mlp": 0.01125583, + "balance_loss_clip": 1.00224805, + "balance_loss_mlp": 1.00084233, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 1.5429466756348753, + "language_loss": 0.84652424, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.869317, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.58296275138855 + }, + { + "auxiliary_loss_clip": 0.0110481, + "auxiliary_loss_mlp": 0.01125151, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00060177, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.6026077086003294, + "language_loss": 0.74481773, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76711732, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.733337879180908 + }, + { + "auxiliary_loss_clip": 0.01123595, + "auxiliary_loss_mlp": 0.0112603, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00071681, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.884069683032169, + "language_loss": 0.75862008, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.78111631, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.701322555541992 + }, + { + "auxiliary_loss_clip": 0.01153805, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00087333, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.1346584817541148, + "language_loss": 0.80002564, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82281601, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.6702303886413574 + }, + { + "auxiliary_loss_clip": 0.01121236, + "auxiliary_loss_mlp": 0.01124316, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.0006249, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.6155660736090407, + "language_loss": 0.69726193, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71971744, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.675422430038452 + }, + { + "auxiliary_loss_clip": 0.01136657, + "auxiliary_loss_mlp": 0.00748418, + "balance_loss_clip": 1.00202966, + "balance_loss_mlp": 1.00166345, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 1.6235751376224805, + "language_loss": 0.83982247, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85867321, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.570626974105835 + }, + { + "auxiliary_loss_clip": 0.01170398, + "auxiliary_loss_mlp": 0.01126674, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.00059879, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.8088043846557902, + "language_loss": 0.67983484, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70280558, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.5844011306762695 + }, + { + "auxiliary_loss_clip": 0.01170417, + "auxiliary_loss_mlp": 0.00748477, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00156403, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.733412084551148, + "language_loss": 0.77588922, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79507816, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.5311312675476074 + }, + { + "auxiliary_loss_clip": 0.01170628, + "auxiliary_loss_mlp": 0.0112547, + "balance_loss_clip": 1.00227427, + "balance_loss_mlp": 1.00072896, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.5835633179881332, + "language_loss": 0.68514419, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70810521, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.5271410942077637 + }, + { + "auxiliary_loss_clip": 0.01138208, + "auxiliary_loss_mlp": 0.01124795, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00062704, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 2.3268482638304118, + "language_loss": 0.67362171, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69625175, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.576054811477661 + }, + { + "auxiliary_loss_clip": 0.01120946, + "auxiliary_loss_mlp": 0.01124823, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00065482, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8707286051286656, + "language_loss": 0.77030563, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79276335, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.6260757446289062 + }, + { + "auxiliary_loss_clip": 0.01137197, + "auxiliary_loss_mlp": 0.01107944, + "balance_loss_clip": 1.00196528, + "balance_loss_mlp": 1.00036991, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7631997749101568, + "language_loss": 0.64840251, + "learning_rate": 2.634013214657026e-06, + "loss": 0.67085391, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.131758213043213 + }, + { + "auxiliary_loss_clip": 0.01126517, + "auxiliary_loss_mlp": 0.01124749, + "balance_loss_clip": 1.00272918, + "balance_loss_mlp": 1.00077128, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.8381950190025134, + "language_loss": 0.87024474, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89275742, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 2.6276421546936035 + }, + { + "auxiliary_loss_clip": 0.01152267, + "auxiliary_loss_mlp": 0.01107607, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00003231, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8006555208918384, + "language_loss": 0.62121427, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64381295, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.079169988632202 + }, + { + "auxiliary_loss_clip": 0.01170713, + "auxiliary_loss_mlp": 0.01126482, + "balance_loss_clip": 1.00231755, + "balance_loss_mlp": 1.00069213, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 3.869091143797942, + "language_loss": 0.87594354, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.89891559, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 2.4841856956481934 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01124805, + "balance_loss_clip": 1.00227642, + "balance_loss_mlp": 1.00092292, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 1.8316478541929455, + "language_loss": 0.62903464, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65181935, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.584545373916626 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.00748425, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00158346, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.6486686975213818, + "language_loss": 0.75735456, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77622533, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 2.602005958557129 + }, + { + "auxiliary_loss_clip": 0.01105386, + "auxiliary_loss_mlp": 0.01125462, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00081682, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.6817671434235546, + "language_loss": 0.8783679, + "learning_rate": 2.631796535141458e-06, + "loss": 0.90067637, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.703350067138672 + }, + { + "auxiliary_loss_clip": 0.011447, + "auxiliary_loss_mlp": 0.01125547, + "balance_loss_clip": 1.00249219, + "balance_loss_mlp": 1.00090194, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.165015115886443, + "language_loss": 0.71323729, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73593968, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 2.58966064453125 + }, + { + "auxiliary_loss_clip": 0.01170402, + "auxiliary_loss_mlp": 0.01126265, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.0006659, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.3428424334813889, + "language_loss": 0.71671665, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73968333, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 3.959934711456299 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01125166, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00061643, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.4048056715288308, + "language_loss": 0.81056678, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83324456, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.6003987789154053 + }, + { + "auxiliary_loss_clip": 0.01153566, + "auxiliary_loss_mlp": 0.01125478, + "balance_loss_clip": 1.00212419, + "balance_loss_mlp": 1.00083232, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.2987626875174858, + "language_loss": 0.70187724, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72466767, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.7267138957977295 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01125777, + "balance_loss_clip": 1.00214911, + "balance_loss_mlp": 1.00075054, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 1.850450199978395, + "language_loss": 0.81532103, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83796704, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.5661396980285645 + }, + { + "auxiliary_loss_clip": 0.01136866, + "auxiliary_loss_mlp": 0.01125881, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00075865, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 2.1203716774710153, + "language_loss": 0.65282553, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67545307, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.5649471282958984 + }, + { + "auxiliary_loss_clip": 0.01138543, + "auxiliary_loss_mlp": 0.01126407, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00090301, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.4084490966184995, + "language_loss": 0.80847597, + "learning_rate": 2.629209319173274e-06, + "loss": 0.83112556, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.549510955810547 + }, + { + "auxiliary_loss_clip": 0.01137095, + "auxiliary_loss_mlp": 0.01126216, + "balance_loss_clip": 1.00216329, + "balance_loss_mlp": 1.00080752, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.528800058407854, + "language_loss": 0.67851901, + "learning_rate": 2.628839621341247e-06, + "loss": 0.70115221, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 4.0203375816345215 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01126395, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00089133, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.8568753615049272, + "language_loss": 0.75729805, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.77995372, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 2.644479274749756 + }, + { + "auxiliary_loss_clip": 0.01170372, + "auxiliary_loss_mlp": 0.01125473, + "balance_loss_clip": 1.00212204, + "balance_loss_mlp": 1.00073206, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.6979403934433424, + "language_loss": 0.7292645, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75222301, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.5299856662750244 + }, + { + "auxiliary_loss_clip": 0.01138288, + "auxiliary_loss_mlp": 0.0112478, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00061154, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.1009344817023203, + "language_loss": 0.8413713, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86400193, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.559951066970825 + }, + { + "auxiliary_loss_clip": 0.01137139, + "auxiliary_loss_mlp": 0.01125002, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00064301, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6133164477112425, + "language_loss": 0.86217421, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88479567, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 4.067287445068359 + }, + { + "auxiliary_loss_clip": 0.01153894, + "auxiliary_loss_mlp": 0.01125574, + "balance_loss_clip": 1.00219309, + "balance_loss_mlp": 1.00083303, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.4782530562000704, + "language_loss": 0.7204634, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74325806, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 3.9453673362731934 + }, + { + "auxiliary_loss_clip": 0.01136864, + "auxiliary_loss_mlp": 0.01125916, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00079417, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.0737403490866924, + "language_loss": 0.78018093, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.8028087, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": -0.036283016204833984 + }, + { + "auxiliary_loss_clip": 0.01170356, + "auxiliary_loss_mlp": 0.01125343, + "balance_loss_clip": 1.00219166, + "balance_loss_mlp": 1.00069809, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.9592169945071767, + "language_loss": 0.71064961, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73360664, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.491232395172119 + }, + { + "auxiliary_loss_clip": 0.01136853, + "auxiliary_loss_mlp": 0.01125973, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00085139, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7857741483286718, + "language_loss": 0.81004548, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83267379, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.5830602645874023 + }, + { + "auxiliary_loss_clip": 0.01106893, + "auxiliary_loss_mlp": 0.01124807, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00063848, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 4.9474437467804, + "language_loss": 0.79048061, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81279761, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.696960210800171 + }, + { + "auxiliary_loss_clip": 0.01136727, + "auxiliary_loss_mlp": 0.0074842, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00159156, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 1.9381157767249362, + "language_loss": 0.81468904, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83354056, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.6572675704956055 + }, + { + "auxiliary_loss_clip": 0.01170392, + "auxiliary_loss_mlp": 0.0112586, + "balance_loss_clip": 1.00206506, + "balance_loss_mlp": 1.00064254, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.303925576657727, + "language_loss": 0.77011573, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79307824, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.4940109252929688 + }, + { + "auxiliary_loss_clip": 0.01153486, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_clip": 1.00205886, + "balance_loss_mlp": 1.0006938, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.8643588306243222, + "language_loss": 0.6757009, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69848728, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.5409915447235107 + }, + { + "auxiliary_loss_clip": 0.01136811, + "auxiliary_loss_mlp": 0.01125358, + "balance_loss_clip": 1.00208044, + "balance_loss_mlp": 1.00090361, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.5486384092997345, + "language_loss": 0.73515821, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75777996, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.548072099685669 + }, + { + "auxiliary_loss_clip": 0.01153715, + "auxiliary_loss_mlp": 0.01125142, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00097382, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 2.757277336215755, + "language_loss": 0.7332164, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75600499, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.5163683891296387 + }, + { + "auxiliary_loss_clip": 0.01137032, + "auxiliary_loss_mlp": 0.01125021, + "balance_loss_clip": 1.00208187, + "balance_loss_mlp": 1.00075698, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 4.361705479129161, + "language_loss": 0.84122276, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86384332, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.6501362323760986 + }, + { + "auxiliary_loss_clip": 0.01143302, + "auxiliary_loss_mlp": 0.01126033, + "balance_loss_clip": 1.00267816, + "balance_loss_mlp": 1.0007205, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 1.7824868572381054, + "language_loss": 0.74596083, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76865417, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.631289482116699 + }, + { + "auxiliary_loss_clip": 0.0115472, + "auxiliary_loss_mlp": 0.01125471, + "balance_loss_clip": 1.00212312, + "balance_loss_mlp": 1.00073016, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.712762018423959, + "language_loss": 0.75035322, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77315509, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.569373846054077 + }, + { + "auxiliary_loss_clip": 0.01170483, + "auxiliary_loss_mlp": 0.01124941, + "balance_loss_clip": 1.00227475, + "balance_loss_mlp": 1.00086784, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.5680113769396817, + "language_loss": 0.71170294, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73465717, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.5424177646636963 + }, + { + "auxiliary_loss_clip": 0.01153977, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_clip": 1.00214851, + "balance_loss_mlp": 1.00068665, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 2.0472422540885127, + "language_loss": 0.73478878, + "learning_rate": 2.621810847844104e-06, + "loss": 0.75758755, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 2.59687876701355 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.0112571, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00068295, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.52043736299753, + "language_loss": 0.72882414, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.75132036, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.634856939315796 + }, + { + "auxiliary_loss_clip": 0.01123653, + "auxiliary_loss_mlp": 0.00748489, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00165749, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.8943525700939277, + "language_loss": 0.63723016, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65595162, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.7104134559631348 + }, + { + "auxiliary_loss_clip": 0.01140052, + "auxiliary_loss_mlp": 0.01124714, + "balance_loss_clip": 1.0021379, + "balance_loss_mlp": 1.00064087, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.4872974448544636, + "language_loss": 0.70094478, + "learning_rate": 2.620700260921513e-06, + "loss": 0.7235924, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.6214590072631836 + }, + { + "auxiliary_loss_clip": 0.01123407, + "auxiliary_loss_mlp": 0.01125119, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00076008, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.6363560941498891, + "language_loss": 0.80869174, + "learning_rate": 2.620330018187899e-06, + "loss": 0.831177, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.6232590675354004 + }, + { + "auxiliary_loss_clip": 0.01159933, + "auxiliary_loss_mlp": 0.0112429, + "balance_loss_clip": 1.00265491, + "balance_loss_mlp": 1.00069368, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.273356613646233, + "language_loss": 0.77693057, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79977274, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.528517484664917 + }, + { + "auxiliary_loss_clip": 0.0117039, + "auxiliary_loss_mlp": 0.01125327, + "balance_loss_clip": 1.00227427, + "balance_loss_mlp": 1.00087285, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 1.5174358409561883, + "language_loss": 0.71329427, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73625141, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.600682020187378 + }, + { + "auxiliary_loss_clip": 0.01153551, + "auxiliary_loss_mlp": 0.01124558, + "balance_loss_clip": 1.00211084, + "balance_loss_mlp": 1.00067568, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.4873248678006579, + "language_loss": 0.76838648, + "learning_rate": 2.619219148905362e-06, + "loss": 0.7911675, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 2.6576638221740723 + }, + { + "auxiliary_loss_clip": 0.01138802, + "auxiliary_loss_mlp": 0.01125116, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00094819, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.8389033155493038, + "language_loss": 0.81897974, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84161896, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.5926144123077393 + }, + { + "auxiliary_loss_clip": 0.01137071, + "auxiliary_loss_mlp": 0.00748246, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.0016197, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.6674836635354753, + "language_loss": 0.76056653, + "learning_rate": 2.618478451956007e-06, + "loss": 0.77941978, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 2.6356916427612305 + }, + { + "auxiliary_loss_clip": 0.01103952, + "auxiliary_loss_mlp": 0.01125153, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.0006032, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 2.2513943086179795, + "language_loss": 0.73034179, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75263286, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.6442155838012695 + }, + { + "auxiliary_loss_clip": 0.01153973, + "auxiliary_loss_mlp": 0.01125096, + "balance_loss_clip": 1.00227356, + "balance_loss_mlp": 1.00073659, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.5263973310504206, + "language_loss": 0.72133255, + "learning_rate": 2.617737661195593e-06, + "loss": 0.74412322, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 2.5077826976776123 + }, + { + "auxiliary_loss_clip": 0.0115491, + "auxiliary_loss_mlp": 0.01124882, + "balance_loss_clip": 1.00216603, + "balance_loss_mlp": 1.00071383, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 2.003864524254085, + "language_loss": 0.75955361, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78235149, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 2.55165696144104 + }, + { + "auxiliary_loss_clip": 0.01105236, + "auxiliary_loss_mlp": 0.01124735, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00066209, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 1.9768174269560892, + "language_loss": 0.84460002, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86689973, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 4.075308561325073 + }, + { + "auxiliary_loss_clip": 0.01154406, + "auxiliary_loss_mlp": 0.01124352, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00066018, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.5556922712914665, + "language_loss": 0.82919729, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85198486, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.579254150390625 + }, + { + "auxiliary_loss_clip": 0.01121292, + "auxiliary_loss_mlp": 0.01125171, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00081146, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.0737154630234262, + "language_loss": 0.71520543, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73767006, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.5671679973602295 + }, + { + "auxiliary_loss_clip": 0.01121555, + "auxiliary_loss_mlp": 0.01124741, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00095391, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 1.87251121566398, + "language_loss": 0.75370508, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77616805, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 2.6194441318511963 + }, + { + "auxiliary_loss_clip": 0.01108474, + "auxiliary_loss_mlp": 0.00748466, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.0018034, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 2.128544057685484, + "language_loss": 0.77074409, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.7893135, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 2.6678881645202637 + }, + { + "auxiliary_loss_clip": 0.01122047, + "auxiliary_loss_mlp": 0.00748491, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.00167596, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.8412664746364766, + "language_loss": 0.77012599, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78883135, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 4.009449005126953 + }, + { + "auxiliary_loss_clip": 0.01136881, + "auxiliary_loss_mlp": 0.01123836, + "balance_loss_clip": 1.00209939, + "balance_loss_mlp": 1.00052631, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.8494132731477806, + "language_loss": 0.7560842, + "learning_rate": 2.614773562290835e-06, + "loss": 0.77869141, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.594155788421631 + }, + { + "auxiliary_loss_clip": 0.01119776, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_clip": 1.0017519, + "balance_loss_mlp": 1.00008416, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7851772275935229, + "language_loss": 0.54703099, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56929767, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.1332173347473145 + }, + { + "auxiliary_loss_clip": 0.01153779, + "auxiliary_loss_mlp": 0.01125635, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00089467, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8510583206294273, + "language_loss": 0.85118002, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87397414, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 2.5109424591064453 + }, + { + "auxiliary_loss_clip": 0.01137193, + "auxiliary_loss_mlp": 0.01124115, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00070906, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.555920734059391, + "language_loss": 0.70133841, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72395146, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.5980560779571533 + }, + { + "auxiliary_loss_clip": 0.01170154, + "auxiliary_loss_mlp": 0.01123513, + "balance_loss_clip": 1.00213253, + "balance_loss_mlp": 1.00067937, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.4898307828935953, + "language_loss": 0.71064293, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73357958, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 3.9948527812957764 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01124388, + "balance_loss_clip": 1.00189221, + "balance_loss_mlp": 1.00069666, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.4675527689955996, + "language_loss": 0.72114879, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74344039, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 4.063812017440796 + }, + { + "auxiliary_loss_clip": 0.01154876, + "auxiliary_loss_mlp": 0.01125053, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00069404, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 1.977858006550397, + "language_loss": 0.71035445, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73315364, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.715104579925537 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01106849, + "balance_loss_clip": 1.00181568, + "balance_loss_mlp": 1.00003803, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6703481459817876, + "language_loss": 0.46246693, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48505083, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.1334736347198486 + }, + { + "auxiliary_loss_clip": 0.01153439, + "auxiliary_loss_mlp": 0.01125293, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00083816, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.7736471563931733, + "language_loss": 0.74887645, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77166378, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.6068782806396484 + }, + { + "auxiliary_loss_clip": 0.01136419, + "auxiliary_loss_mlp": 0.01124263, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00076199, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.8085390124941194, + "language_loss": 0.8038367, + "learning_rate": 2.611437167992705e-06, + "loss": 0.82644349, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.630702495574951 + }, + { + "auxiliary_loss_clip": 0.01153446, + "auxiliary_loss_mlp": 0.01124281, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00078034, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.7817153769938734, + "language_loss": 0.83218652, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.85496384, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.551161527633667 + }, + { + "auxiliary_loss_clip": 0.0113713, + "auxiliary_loss_mlp": 0.0112519, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00092649, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 2.1988650902358313, + "language_loss": 0.74971008, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.77233326, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.5877346992492676 + }, + { + "auxiliary_loss_clip": 0.01139878, + "auxiliary_loss_mlp": 0.01123647, + "balance_loss_clip": 1.00209332, + "balance_loss_mlp": 1.00062323, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.582255953343518, + "language_loss": 0.72608232, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74871761, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.743541955947876 + }, + { + "auxiliary_loss_clip": 0.0112034, + "auxiliary_loss_mlp": 0.01125357, + "balance_loss_clip": 1.00201237, + "balance_loss_mlp": 1.00090265, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.1036767703600647, + "language_loss": 0.75103891, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77349585, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.657214403152466 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01124053, + "balance_loss_clip": 1.00203478, + "balance_loss_mlp": 1.00055194, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.7383710502120737, + "language_loss": 0.72861338, + "learning_rate": 2.609582803447259e-06, + "loss": 0.75139976, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.5298283100128174 + }, + { + "auxiliary_loss_clip": 0.01155296, + "auxiliary_loss_mlp": 0.01125285, + "balance_loss_clip": 1.00220108, + "balance_loss_mlp": 1.00073552, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.5167453427346573, + "language_loss": 0.80933702, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83214283, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.6475882530212402 + }, + { + "auxiliary_loss_clip": 0.01144145, + "auxiliary_loss_mlp": 0.01124513, + "balance_loss_clip": 1.00223947, + "balance_loss_mlp": 1.00053501, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.1253189441445195, + "language_loss": 0.68106127, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.70374787, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.580281972885132 + }, + { + "auxiliary_loss_clip": 0.0115346, + "auxiliary_loss_mlp": 0.01124259, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00075781, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.3286513297381513, + "language_loss": 0.81148851, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83426571, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.558924436569214 + }, + { + "auxiliary_loss_clip": 0.01170214, + "auxiliary_loss_mlp": 0.0112478, + "balance_loss_clip": 1.00207901, + "balance_loss_mlp": 1.00089812, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.5971815856905542, + "language_loss": 0.82579893, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84874886, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.5454201698303223 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01124289, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.00078845, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.7302193373761476, + "language_loss": 0.83393151, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85687441, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.524380922317505 + }, + { + "auxiliary_loss_clip": 0.01170237, + "auxiliary_loss_mlp": 0.01124381, + "balance_loss_clip": 1.00213873, + "balance_loss_mlp": 1.00078475, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.4263270820638105, + "language_loss": 0.79674113, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81968725, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.4995126724243164 + }, + { + "auxiliary_loss_clip": 0.01121192, + "auxiliary_loss_mlp": 0.01123255, + "balance_loss_clip": 1.00177741, + "balance_loss_mlp": 1.00061297, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.6545475471202267, + "language_loss": 0.84618163, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.868626, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.6492974758148193 + }, + { + "auxiliary_loss_clip": 0.01159198, + "auxiliary_loss_mlp": 0.01125305, + "balance_loss_clip": 1.0020963, + "balance_loss_mlp": 1.00065935, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 6.140944971746361, + "language_loss": 0.56714261, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58998764, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.5754597187042236 + }, + { + "auxiliary_loss_clip": 0.01153297, + "auxiliary_loss_mlp": 0.01124679, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00070155, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 1.7460011022290438, + "language_loss": 0.82377023, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84654999, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.5421128273010254 + }, + { + "auxiliary_loss_clip": 0.01155377, + "auxiliary_loss_mlp": 0.01124142, + "balance_loss_clip": 1.00219834, + "balance_loss_mlp": 1.00064158, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.584301643416937, + "language_loss": 0.78948593, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81228113, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.5523364543914795 + }, + { + "auxiliary_loss_clip": 0.01170231, + "auxiliary_loss_mlp": 0.01125341, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.0006001, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.7183843862785608, + "language_loss": 0.78225237, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80520809, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01139029, + "auxiliary_loss_mlp": 0.01123708, + "balance_loss_clip": 1.00201845, + "balance_loss_mlp": 1.00068378, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 2.5490850984446918, + "language_loss": 0.7224673, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74509466, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.6285457611083984 + }, + { + "auxiliary_loss_clip": 0.01139984, + "auxiliary_loss_mlp": 0.00748439, + "balance_loss_clip": 1.00216961, + "balance_loss_mlp": 1.00178814, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4907141793209737, + "language_loss": 0.74998087, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76886511, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.718066930770874 + }, + { + "auxiliary_loss_clip": 0.01153583, + "auxiliary_loss_mlp": 0.01124986, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00062728, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.4935222573549292, + "language_loss": 0.7417298, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76451552, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 2.5876519680023193 + }, + { + "auxiliary_loss_clip": 0.01137993, + "auxiliary_loss_mlp": 0.01124651, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00067329, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.9490330868229346, + "language_loss": 0.71042788, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73305428, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 2.5872697830200195 + }, + { + "auxiliary_loss_clip": 0.01150239, + "auxiliary_loss_mlp": 0.00746735, + "balance_loss_clip": 1.00169659, + "balance_loss_mlp": 1.00059342, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8338259318801416, + "language_loss": 0.60428047, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62325013, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 3.011702299118042 + }, + { + "auxiliary_loss_clip": 0.01170346, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_clip": 1.0022676, + "balance_loss_mlp": 1.00071478, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.8541241925773275, + "language_loss": 0.83186328, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85482132, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.5761053562164307 + }, + { + "auxiliary_loss_clip": 0.01166925, + "auxiliary_loss_mlp": 0.01106799, + "balance_loss_clip": 1.00200641, + "balance_loss_mlp": 0.99998778, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8144432914800864, + "language_loss": 0.65510237, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67783964, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 4.502557277679443 + }, + { + "auxiliary_loss_clip": 0.01170348, + "auxiliary_loss_mlp": 0.01125828, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.00089669, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 1.9635641988293107, + "language_loss": 0.8372314, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.86019313, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.4902775287628174 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0074859, + "balance_loss_clip": 1.00221753, + "balance_loss_mlp": 1.0018239, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.642317415947561, + "language_loss": 0.77972364, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.79876125, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.5633225440979004 + }, + { + "auxiliary_loss_clip": 0.01124794, + "auxiliary_loss_mlp": 0.01124313, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00062132, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.3999916023546997, + "language_loss": 0.80167747, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82416856, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 2.7024152278900146 + }, + { + "auxiliary_loss_clip": 0.01138453, + "auxiliary_loss_mlp": 0.00748581, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.00179136, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 1.8862253128450677, + "language_loss": 0.7541461, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77301633, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.585280179977417 + }, + { + "auxiliary_loss_clip": 0.01170117, + "auxiliary_loss_mlp": 0.01124506, + "balance_loss_clip": 1.00199497, + "balance_loss_mlp": 1.00081468, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.7319187156266933, + "language_loss": 0.75287312, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77581936, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 3.8898699283599854 + }, + { + "auxiliary_loss_clip": 0.01170335, + "auxiliary_loss_mlp": 0.0112522, + "balance_loss_clip": 1.00227904, + "balance_loss_mlp": 1.00095677, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.7234440837904819, + "language_loss": 0.76121545, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78417104, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 2.591665744781494 + }, + { + "auxiliary_loss_clip": 0.01124082, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.0008918, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.8645156377623997, + "language_loss": 0.64168423, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66417563, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.665138006210327 + }, + { + "auxiliary_loss_clip": 0.01121194, + "auxiliary_loss_mlp": 0.0112546, + "balance_loss_clip": 1.00206947, + "balance_loss_mlp": 1.00071979, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.8830613426587977, + "language_loss": 0.76353669, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78600323, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.626481533050537 + }, + { + "auxiliary_loss_clip": 0.01125356, + "auxiliary_loss_mlp": 0.00748486, + "balance_loss_clip": 1.00219417, + "balance_loss_mlp": 1.0018028, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.438834960902246, + "language_loss": 0.86649436, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88523281, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.631305694580078 + }, + { + "auxiliary_loss_clip": 0.01120136, + "auxiliary_loss_mlp": 0.01124439, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00074744, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 1.9552496784826567, + "language_loss": 0.67812169, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.70056742, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 4.039630889892578 + }, + { + "auxiliary_loss_clip": 0.01170175, + "auxiliary_loss_mlp": 0.01125295, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00074506, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 1.9364148778129346, + "language_loss": 0.77387106, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79682577, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 4.0022478103637695 + }, + { + "auxiliary_loss_clip": 0.01170112, + "auxiliary_loss_mlp": 0.01124886, + "balance_loss_clip": 1.00219274, + "balance_loss_mlp": 1.00081301, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.919808152662478, + "language_loss": 0.68539965, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70834959, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.51928448677063 + }, + { + "auxiliary_loss_clip": 0.01153515, + "auxiliary_loss_mlp": 0.01124955, + "balance_loss_clip": 1.00214136, + "balance_loss_mlp": 1.00078642, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.8697666303011207, + "language_loss": 0.72081596, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74360061, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.572509765625 + }, + { + "auxiliary_loss_clip": 0.0117021, + "auxiliary_loss_mlp": 0.01125027, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00066757, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.8307213932178945, + "language_loss": 0.70589632, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.7288487, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.519413471221924 + }, + { + "auxiliary_loss_clip": 0.01137804, + "auxiliary_loss_mlp": 0.0074845, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00186682, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7814204910110172, + "language_loss": 0.82320857, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84207112, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.558892250061035 + }, + { + "auxiliary_loss_clip": 0.01119657, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00086164, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 1.8099066950855636, + "language_loss": 0.72008884, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74254048, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.6667075157165527 + }, + { + "auxiliary_loss_clip": 0.01170343, + "auxiliary_loss_mlp": 0.01125019, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00066018, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 6.036266966195665, + "language_loss": 0.66097116, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68392479, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.570892333984375 + }, + { + "auxiliary_loss_clip": 0.01122585, + "auxiliary_loss_mlp": 0.01124456, + "balance_loss_clip": 1.00190961, + "balance_loss_mlp": 1.00076389, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.5085146538180703, + "language_loss": 0.72130144, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74377185, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.637737989425659 + }, + { + "auxiliary_loss_clip": 0.01152021, + "auxiliary_loss_mlp": 0.01107013, + "balance_loss_clip": 1.00185525, + "balance_loss_mlp": 1.00020111, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7949659898839686, + "language_loss": 0.54391837, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56650871, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 3.0194859504699707 + }, + { + "auxiliary_loss_clip": 0.01153413, + "auxiliary_loss_mlp": 0.01125352, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00080204, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3982237467103658, + "language_loss": 0.78651845, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80930614, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.5928268432617188 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01124788, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00080991, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.7404686354822663, + "language_loss": 0.81540763, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83835846, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.5343008041381836 + }, + { + "auxiliary_loss_clip": 0.01155321, + "auxiliary_loss_mlp": 0.01125274, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00081944, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.453297196927243, + "language_loss": 0.77723157, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.8000375, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.6142492294311523 + }, + { + "auxiliary_loss_clip": 0.01170257, + "auxiliary_loss_mlp": 0.0112588, + "balance_loss_clip": 1.00221646, + "balance_loss_mlp": 1.00075746, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.574932093482577, + "language_loss": 0.82199287, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84495425, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.578061103820801 + }, + { + "auxiliary_loss_clip": 0.01123212, + "auxiliary_loss_mlp": 0.01124516, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00072861, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 1.9941932869062666, + "language_loss": 0.67681062, + "learning_rate": 2.593983497660586e-06, + "loss": 0.69928789, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.634511709213257 + }, + { + "auxiliary_loss_clip": 0.01149967, + "auxiliary_loss_mlp": 0.01106957, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00014591, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6771893718493305, + "language_loss": 0.59414148, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61671072, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.185013771057129 + }, + { + "auxiliary_loss_clip": 0.01155145, + "auxiliary_loss_mlp": 0.01125706, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00067878, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 2.0058213568689163, + "language_loss": 0.75157315, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77438164, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 2.5243747234344482 + }, + { + "auxiliary_loss_clip": 0.01143433, + "auxiliary_loss_mlp": 0.01125134, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.00067949, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.8492684179948324, + "language_loss": 0.69028962, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71297526, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.5967650413513184 + }, + { + "auxiliary_loss_clip": 0.0113805, + "auxiliary_loss_mlp": 0.00748435, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00175154, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.6361862876187536, + "language_loss": 0.81195331, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83081812, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.6104960441589355 + }, + { + "auxiliary_loss_clip": 0.01092765, + "auxiliary_loss_mlp": 0.01125127, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00067234, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.7325001284415777, + "language_loss": 0.69945818, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72163707, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.8098104000091553 + }, + { + "auxiliary_loss_clip": 0.01154653, + "auxiliary_loss_mlp": 0.01123751, + "balance_loss_clip": 1.00219798, + "balance_loss_mlp": 1.00082254, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.5737081422715191, + "language_loss": 0.67529029, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69807434, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.647296905517578 + }, + { + "auxiliary_loss_clip": 0.01138221, + "auxiliary_loss_mlp": 0.01124716, + "balance_loss_clip": 1.00217533, + "balance_loss_mlp": 1.00083327, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.5441712060909354, + "language_loss": 0.69552183, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71815121, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 2.610471487045288 + }, + { + "auxiliary_loss_clip": 0.01170238, + "auxiliary_loss_mlp": 0.0112454, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00075293, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.709575027375214, + "language_loss": 0.76716626, + "learning_rate": 2.591007664594147e-06, + "loss": 0.79011405, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 2.54198956489563 + }, + { + "auxiliary_loss_clip": 0.0113858, + "auxiliary_loss_mlp": 0.0112416, + "balance_loss_clip": 1.00214624, + "balance_loss_mlp": 1.00084996, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 4.191897718128919, + "language_loss": 0.79735839, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81998575, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.6004931926727295 + }, + { + "auxiliary_loss_clip": 0.01166904, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00017154, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7251487225044272, + "language_loss": 0.61947405, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64221293, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.175344228744507 + }, + { + "auxiliary_loss_clip": 0.01170141, + "auxiliary_loss_mlp": 0.0112415, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.00074422, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.3277794480465412, + "language_loss": 0.7072413, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73018414, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 2.555845260620117 + }, + { + "auxiliary_loss_clip": 0.01142179, + "auxiliary_loss_mlp": 0.01125759, + "balance_loss_clip": 1.00214112, + "balance_loss_mlp": 1.00073278, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.8291062772312918, + "language_loss": 0.8251785, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84785795, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0110638, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00084722, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.8611668994565485, + "language_loss": 0.75064933, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77295852, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 2.6843934059143066 + }, + { + "auxiliary_loss_clip": 0.01170318, + "auxiliary_loss_mlp": 0.01124954, + "balance_loss_clip": 1.00221801, + "balance_loss_mlp": 1.00078583, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 2.1969797377491314, + "language_loss": 0.86539137, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88834411, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 2.535370349884033 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01124378, + "balance_loss_clip": 1.00211358, + "balance_loss_mlp": 1.00078225, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 1.823710390555348, + "language_loss": 0.73479903, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75757718, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 4.076409339904785 + }, + { + "auxiliary_loss_clip": 0.01138783, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00093496, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.5629293204460286, + "language_loss": 0.69848198, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72112083, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 2.643338441848755 + }, + { + "auxiliary_loss_clip": 0.01139285, + "auxiliary_loss_mlp": 0.00748575, + "balance_loss_clip": 1.00213981, + "balance_loss_mlp": 1.00192308, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 2.154570175323475, + "language_loss": 0.90318251, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92206115, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 2.622204542160034 + }, + { + "auxiliary_loss_clip": 0.01138355, + "auxiliary_loss_mlp": 0.01124335, + "balance_loss_clip": 1.0019902, + "balance_loss_mlp": 1.00083423, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.5244787130964534, + "language_loss": 0.76938313, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79201007, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.6337761878967285 + }, + { + "auxiliary_loss_clip": 0.0115367, + "auxiliary_loss_mlp": 0.01124666, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00097466, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.7751015533361052, + "language_loss": 0.82497317, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84775651, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 3.9088079929351807 + }, + { + "auxiliary_loss_clip": 0.01136582, + "auxiliary_loss_mlp": 0.01124413, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00072157, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.6179068004759805, + "language_loss": 0.70812464, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73073459, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 2.6004185676574707 + }, + { + "auxiliary_loss_clip": 0.01119466, + "auxiliary_loss_mlp": 0.00748641, + "balance_loss_clip": 1.00175691, + "balance_loss_mlp": 1.00198793, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.6352079528562902, + "language_loss": 0.78277099, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80145204, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.6374998092651367 + }, + { + "auxiliary_loss_clip": 0.01112343, + "auxiliary_loss_mlp": 0.0112564, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00080419, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.248507396845068, + "language_loss": 0.66866219, + "learning_rate": 2.585796509770259e-06, + "loss": 0.69104201, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.633423328399658 + }, + { + "auxiliary_loss_clip": 0.01154743, + "auxiliary_loss_mlp": 0.01126208, + "balance_loss_clip": 1.00209248, + "balance_loss_mlp": 1.00070488, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 2.577886346611386, + "language_loss": 0.75720894, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78001845, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.5848355293273926 + }, + { + "auxiliary_loss_clip": 0.01153327, + "auxiliary_loss_mlp": 0.0112475, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00058126, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.902076676912742, + "language_loss": 0.64613271, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.66891348, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 5.381670713424683 + }, + { + "auxiliary_loss_clip": 0.01138613, + "auxiliary_loss_mlp": 0.01124793, + "balance_loss_clip": 1.00205088, + "balance_loss_mlp": 1.0006249, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.5129796663790667, + "language_loss": 0.73855275, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76118684, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 2.948570728302002 + }, + { + "auxiliary_loss_clip": 0.01153499, + "auxiliary_loss_mlp": 0.0112425, + "balance_loss_clip": 1.0021801, + "balance_loss_mlp": 1.00065422, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.4170447921149016, + "language_loss": 0.82244253, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84522003, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.6323986053466797 + }, + { + "auxiliary_loss_clip": 0.01136957, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00083923, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 1.9613417630462726, + "language_loss": 0.6476866, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67030716, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.6147398948669434 + }, + { + "auxiliary_loss_clip": 0.0115552, + "auxiliary_loss_mlp": 0.01125605, + "balance_loss_clip": 1.00227106, + "balance_loss_mlp": 1.00096011, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.551944348201046, + "language_loss": 0.75125468, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77406585, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.7012228965759277 + }, + { + "auxiliary_loss_clip": 0.01122418, + "auxiliary_loss_mlp": 0.01124173, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00076723, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.3003780480341702, + "language_loss": 0.80337167, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82583755, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.6146061420440674 + }, + { + "auxiliary_loss_clip": 0.0106547, + "auxiliary_loss_mlp": 0.01124345, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.00055814, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 2.0424865218568775, + "language_loss": 0.77214289, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.79404104, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.7826273441314697 + }, + { + "auxiliary_loss_clip": 0.01170103, + "auxiliary_loss_mlp": 0.01124706, + "balance_loss_clip": 1.00219071, + "balance_loss_mlp": 1.00063252, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.9530804249749456, + "language_loss": 0.68015963, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70310771, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.5517935752868652 + }, + { + "auxiliary_loss_clip": 0.0115516, + "auxiliary_loss_mlp": 0.01124788, + "balance_loss_clip": 1.00223374, + "balance_loss_mlp": 1.00080991, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.6880448355045874, + "language_loss": 0.77608806, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.79888755, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.531710147857666 + }, + { + "auxiliary_loss_clip": 0.01153595, + "auxiliary_loss_mlp": 0.01125084, + "balance_loss_clip": 1.00217068, + "balance_loss_mlp": 1.00091541, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.7228598691770196, + "language_loss": 0.82714236, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84992915, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.5788440704345703 + }, + { + "auxiliary_loss_clip": 0.01170175, + "auxiliary_loss_mlp": 0.01124944, + "balance_loss_clip": 1.0021621, + "balance_loss_mlp": 1.00068069, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 1.8838598521507668, + "language_loss": 0.736018, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75896919, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.4874789714813232 + }, + { + "auxiliary_loss_clip": 0.01121736, + "auxiliary_loss_mlp": 0.01125413, + "balance_loss_clip": 1.0020287, + "balance_loss_mlp": 1.00067282, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.41980126727377, + "language_loss": 0.86382067, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88629216, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.6533350944519043 + }, + { + "auxiliary_loss_clip": 0.01136522, + "auxiliary_loss_mlp": 0.01124116, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00090122, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4355151084376208, + "language_loss": 0.72248709, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74509346, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.5981218814849854 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.00748567, + "balance_loss_clip": 1.00215507, + "balance_loss_mlp": 1.00204265, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 2.0383633733854722, + "language_loss": 0.82221174, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84094119, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.6437642574310303 + }, + { + "auxiliary_loss_clip": 0.01150095, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.0000447, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7946397435597045, + "language_loss": 0.60404873, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62661064, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.0759470462799072 + }, + { + "auxiliary_loss_clip": 0.01170336, + "auxiliary_loss_mlp": 0.01125202, + "balance_loss_clip": 1.00222743, + "balance_loss_mlp": 1.00074792, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 3.591362827251673, + "language_loss": 0.7698046, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79276001, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.486767053604126 + }, + { + "auxiliary_loss_clip": 0.0115369, + "auxiliary_loss_mlp": 0.01125572, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.000736, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 1.7128429304839736, + "language_loss": 0.84326792, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86606055, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.585667848587036 + }, + { + "auxiliary_loss_clip": 0.01119918, + "auxiliary_loss_mlp": 0.01125492, + "balance_loss_clip": 1.00195861, + "balance_loss_mlp": 1.00075102, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 1.989736306056827, + "language_loss": 0.8313067, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85376084, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.6353766918182373 + }, + { + "auxiliary_loss_clip": 0.01136043, + "auxiliary_loss_mlp": 0.00748563, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00201368, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 2.040478004739171, + "language_loss": 0.80107701, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.81992316, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.6112747192382812 + }, + { + "auxiliary_loss_clip": 0.01170192, + "auxiliary_loss_mlp": 0.01125441, + "balance_loss_clip": 1.00209153, + "balance_loss_mlp": 1.0007962, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.8694170203520615, + "language_loss": 0.70380414, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72676051, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.4782493114471436 + }, + { + "auxiliary_loss_clip": 0.01155244, + "auxiliary_loss_mlp": 0.01125346, + "balance_loss_clip": 1.00224638, + "balance_loss_mlp": 1.00070119, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.581501938372666, + "language_loss": 0.76264286, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78544873, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.585879325866699 + }, + { + "auxiliary_loss_clip": 0.01153478, + "auxiliary_loss_mlp": 0.01124673, + "balance_loss_clip": 1.00217092, + "balance_loss_mlp": 1.00088644, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 2.221416529824989, + "language_loss": 0.72334194, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.74612343, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.5376031398773193 + }, + { + "auxiliary_loss_clip": 0.01137934, + "auxiliary_loss_mlp": 0.01124961, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.00098348, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.9995439166872857, + "language_loss": 0.66425735, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68688631, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.601106643676758 + }, + { + "auxiliary_loss_clip": 0.01138434, + "auxiliary_loss_mlp": 0.00748674, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00219536, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4738510744159723, + "language_loss": 0.78499389, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80386496, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 2.681403875350952 + }, + { + "auxiliary_loss_clip": 0.01170218, + "auxiliary_loss_mlp": 0.0112476, + "balance_loss_clip": 1.0021596, + "balance_loss_mlp": 1.00068665, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 2.0583562220908065, + "language_loss": 0.74911189, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77206159, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.51682710647583 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01124117, + "balance_loss_clip": 1.0022254, + "balance_loss_mlp": 1.00080729, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.3044533823366122, + "language_loss": 0.72612357, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74889815, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.5802700519561768 + }, + { + "auxiliary_loss_clip": 0.01121061, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_clip": 1.00185382, + "balance_loss_mlp": 1.00058317, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.975356360968282, + "language_loss": 0.80015963, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.82261777, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.6516807079315186 + }, + { + "auxiliary_loss_clip": 0.01167029, + "auxiliary_loss_mlp": 0.01106082, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00003326, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9156217307247799, + "language_loss": 0.63414496, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65687609, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 3.0108683109283447 + }, + { + "auxiliary_loss_clip": 0.01170051, + "auxiliary_loss_mlp": 0.01125479, + "balance_loss_clip": 1.00208938, + "balance_loss_mlp": 1.00054789, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 4.134836140459044, + "language_loss": 0.72593278, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74888802, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 3.938882827758789 + }, + { + "auxiliary_loss_clip": 0.01170266, + "auxiliary_loss_mlp": 0.01124893, + "balance_loss_clip": 1.00230229, + "balance_loss_mlp": 1.00062943, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 1.8944323175531614, + "language_loss": 0.79225588, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81520748, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.5143604278564453 + }, + { + "auxiliary_loss_clip": 0.01153007, + "auxiliary_loss_mlp": 0.0112474, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00066757, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.698810165490124, + "language_loss": 0.70260429, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72538173, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 2.572591781616211 + }, + { + "auxiliary_loss_clip": 0.01170094, + "auxiliary_loss_mlp": 0.01124857, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00068855, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.4103263296341235, + "language_loss": 0.71190012, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73484969, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.546670913696289 + }, + { + "auxiliary_loss_clip": 0.01127464, + "auxiliary_loss_mlp": 0.01125276, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00082135, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5319732299188527, + "language_loss": 0.81577158, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83829904, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 2.6406052112579346 + }, + { + "auxiliary_loss_clip": 0.01153292, + "auxiliary_loss_mlp": 0.01124532, + "balance_loss_clip": 1.00219917, + "balance_loss_mlp": 1.00074542, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.350437759628922, + "language_loss": 0.90738535, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.93016356, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 3.9500722885131836 + }, + { + "auxiliary_loss_clip": 0.011544, + "auxiliary_loss_mlp": 0.00748528, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.00183678, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.696788708211136, + "language_loss": 0.64069408, + "learning_rate": 2.572376498508805e-06, + "loss": 0.65972334, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.5528955459594727 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01124325, + "balance_loss_clip": 1.00199592, + "balance_loss_mlp": 1.00063336, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.475309442636253, + "language_loss": 0.73550129, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75795627, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 2.6247129440307617 + }, + { + "auxiliary_loss_clip": 0.01139886, + "auxiliary_loss_mlp": 0.01125643, + "balance_loss_clip": 1.00209701, + "balance_loss_mlp": 1.00080669, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 2.11861148519014, + "language_loss": 0.79052532, + "learning_rate": 2.571630111462766e-06, + "loss": 0.81318063, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.6153647899627686 + }, + { + "auxiliary_loss_clip": 0.01138064, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.00069475, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.9277392911756874, + "language_loss": 0.73172128, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75433528, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.5974974632263184 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01124343, + "balance_loss_clip": 1.00222921, + "balance_loss_mlp": 1.00084162, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.780499896627561, + "language_loss": 0.79878205, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.82138997, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 5.429187297821045 + }, + { + "auxiliary_loss_clip": 0.01153572, + "auxiliary_loss_mlp": 0.01124868, + "balance_loss_clip": 1.00229669, + "balance_loss_mlp": 1.00089061, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4933446988543069, + "language_loss": 0.7217226, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74450701, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 2.8052175045013428 + }, + { + "auxiliary_loss_clip": 0.01169951, + "auxiliary_loss_mlp": 0.01123744, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00062478, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 1.984918677337742, + "language_loss": 0.80305588, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82599288, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.5221340656280518 + }, + { + "auxiliary_loss_clip": 0.01143938, + "auxiliary_loss_mlp": 0.01123567, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.0006386, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.601936837485548, + "language_loss": 0.81781769, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.84049273, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.57991623878479 + }, + { + "auxiliary_loss_clip": 0.01153233, + "auxiliary_loss_mlp": 0.01124655, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.00067782, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 1.6771034555341444, + "language_loss": 0.69466352, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71744245, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.568728446960449 + }, + { + "auxiliary_loss_clip": 0.01152114, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_clip": 1.00205398, + "balance_loss_mlp": 0.99992734, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8859721946943457, + "language_loss": 0.6711539, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69373488, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.2058160305023193 + }, + { + "auxiliary_loss_clip": 0.01153315, + "auxiliary_loss_mlp": 0.01124521, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00073373, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 1.9483459234382288, + "language_loss": 0.78351408, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80629241, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.5232625007629395 + }, + { + "auxiliary_loss_clip": 0.01159179, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_clip": 1.00220811, + "balance_loss_mlp": 1.00074768, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.766514725318627, + "language_loss": 0.76166803, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78451568, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.514538288116455 + }, + { + "auxiliary_loss_clip": 0.01138145, + "auxiliary_loss_mlp": 0.01125029, + "balance_loss_clip": 1.00200403, + "balance_loss_mlp": 1.0007658, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 1.8934712173158306, + "language_loss": 0.80120718, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82383889, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.570498466491699 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.01124609, + "balance_loss_clip": 1.00210214, + "balance_loss_mlp": 1.00053608, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.7786903227111137, + "language_loss": 0.65972388, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68235296, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.607567071914673 + }, + { + "auxiliary_loss_clip": 0.01111561, + "auxiliary_loss_mlp": 0.01124926, + "balance_loss_clip": 1.00209439, + "balance_loss_mlp": 1.00075746, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 1.8701597413174291, + "language_loss": 0.68045658, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70282143, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.715245246887207 + }, + { + "auxiliary_loss_clip": 0.01104479, + "auxiliary_loss_mlp": 0.01124989, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.00082099, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 3.689994867812665, + "language_loss": 0.73810321, + "learning_rate": 2.566776487287525e-06, + "loss": 0.76039791, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.662785053253174 + }, + { + "auxiliary_loss_clip": 0.01138761, + "auxiliary_loss_mlp": 0.01125339, + "balance_loss_clip": 1.00198209, + "balance_loss_mlp": 1.00088453, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.9562097185060638, + "language_loss": 0.7507962, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.7734372, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.6371545791625977 + }, + { + "auxiliary_loss_clip": 0.01105379, + "auxiliary_loss_mlp": 0.01123178, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00063097, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 2.1211136350341038, + "language_loss": 0.82538259, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84766817, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.667783498764038 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01125864, + "balance_loss_clip": 1.0020479, + "balance_loss_mlp": 1.0007416, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.4762672058706519, + "language_loss": 0.73462743, + "learning_rate": 2.565655903224038e-06, + "loss": 0.75727153, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.6671135425567627 + }, + { + "auxiliary_loss_clip": 0.01155153, + "auxiliary_loss_mlp": 0.01125133, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.00096416, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.167472269013592, + "language_loss": 0.70253336, + "learning_rate": 2.565282332284532e-06, + "loss": 0.72533619, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.5906333923339844 + }, + { + "auxiliary_loss_clip": 0.01119271, + "auxiliary_loss_mlp": 0.0112456, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00077343, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.5407487198509255, + "language_loss": 0.81974071, + "learning_rate": 2.564908739909464e-06, + "loss": 0.84217906, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.6412711143493652 + }, + { + "auxiliary_loss_clip": 0.01170199, + "auxiliary_loss_mlp": 0.01125028, + "balance_loss_clip": 1.00216985, + "balance_loss_mlp": 1.00085986, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.9792441384035133, + "language_loss": 0.80627346, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82922578, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.505195140838623 + }, + { + "auxiliary_loss_clip": 0.01153453, + "auxiliary_loss_mlp": 0.01124593, + "balance_loss_clip": 1.00210679, + "balance_loss_mlp": 1.00061512, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.742111913452774, + "language_loss": 0.65594602, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67872643, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.622744083404541 + }, + { + "auxiliary_loss_clip": 0.01142277, + "auxiliary_loss_mlp": 0.01125045, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.0005908, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.9270878159904883, + "language_loss": 0.74526978, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76794302, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.6413843631744385 + }, + { + "auxiliary_loss_clip": 0.01154216, + "auxiliary_loss_mlp": 0.01124045, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00054431, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7710485271035543, + "language_loss": 0.74868602, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77146864, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.5816380977630615 + }, + { + "auxiliary_loss_clip": 0.01138286, + "auxiliary_loss_mlp": 0.01125465, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.00081968, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.0538854774184614, + "language_loss": 0.82719308, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.84983051, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.6029863357543945 + }, + { + "auxiliary_loss_clip": 0.0113766, + "auxiliary_loss_mlp": 0.01125007, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.00083828, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.414025477394406, + "language_loss": 0.82286441, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84549111, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.6535189151763916 + }, + { + "auxiliary_loss_clip": 0.01170176, + "auxiliary_loss_mlp": 0.01125281, + "balance_loss_clip": 1.0021615, + "balance_loss_mlp": 1.00073075, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8045345054549753, + "language_loss": 0.72866184, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.75161636, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.493868827819824 + }, + { + "auxiliary_loss_clip": 0.01155007, + "auxiliary_loss_mlp": 0.01124663, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00068545, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 2.0911142648268344, + "language_loss": 0.83134675, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.8541435, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.5649595260620117 + }, + { + "auxiliary_loss_clip": 0.01139825, + "auxiliary_loss_mlp": 0.01125953, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.0007354, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 1.878119350116443, + "language_loss": 0.73494387, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75760162, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 2.5621485710144043 + }, + { + "auxiliary_loss_clip": 0.01154952, + "auxiliary_loss_mlp": 0.01123991, + "balance_loss_clip": 1.00213242, + "balance_loss_mlp": 1.0006814, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.0953734233467154, + "language_loss": 0.74639946, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.76918888, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.6428608894348145 + }, + { + "auxiliary_loss_clip": 0.01170172, + "auxiliary_loss_mlp": 0.01125398, + "balance_loss_clip": 1.00221229, + "balance_loss_mlp": 1.00075245, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.8566997325986965, + "language_loss": 0.77066422, + "learning_rate": 2.560797813088819e-06, + "loss": 0.79361987, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.4862570762634277 + }, + { + "auxiliary_loss_clip": 0.01137521, + "auxiliary_loss_mlp": 0.01124459, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.0006721, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.647986105662187, + "language_loss": 0.79889202, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82151186, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 4.054590225219727 + }, + { + "auxiliary_loss_clip": 0.01106793, + "auxiliary_loss_mlp": 0.01124427, + "balance_loss_clip": 1.00191891, + "balance_loss_mlp": 1.00083137, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.6244622365718269, + "language_loss": 0.68017411, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70248628, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 2.7405049800872803 + }, + { + "auxiliary_loss_clip": 0.011422, + "auxiliary_loss_mlp": 0.01124477, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00068974, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.695569739066524, + "language_loss": 0.71438205, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.7370488, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.5786828994750977 + }, + { + "auxiliary_loss_clip": 0.01154975, + "auxiliary_loss_mlp": 0.01124995, + "balance_loss_clip": 1.00211394, + "balance_loss_mlp": 1.00063562, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 2.402570423125528, + "language_loss": 0.64317626, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66597593, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.5954859256744385 + }, + { + "auxiliary_loss_clip": 0.01169959, + "auxiliary_loss_mlp": 0.00748551, + "balance_loss_clip": 1.00209308, + "balance_loss_mlp": 1.00198293, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6568290754236263, + "language_loss": 0.76352566, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78271073, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.5414834022521973 + }, + { + "auxiliary_loss_clip": 0.01121315, + "auxiliary_loss_mlp": 0.01124314, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00081336, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.683152826124584, + "language_loss": 0.72898161, + "learning_rate": 2.558554403622845e-06, + "loss": 0.7514379, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 4.04108738899231 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01124455, + "balance_loss_clip": 1.00207186, + "balance_loss_mlp": 1.00085926, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.6025471187069857, + "language_loss": 0.71535724, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73798382, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 2.6390137672424316 + }, + { + "auxiliary_loss_clip": 0.01152975, + "auxiliary_loss_mlp": 0.01125506, + "balance_loss_clip": 1.00213325, + "balance_loss_mlp": 1.00095654, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.570215565361113, + "language_loss": 0.61778802, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64057285, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.5630640983581543 + }, + { + "auxiliary_loss_clip": 0.01153709, + "auxiliary_loss_mlp": 0.01125964, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.0009377, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.6028881432172786, + "language_loss": 0.64993787, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.67273468, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.5699474811553955 + }, + { + "auxiliary_loss_clip": 0.01138654, + "auxiliary_loss_mlp": 0.01124464, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00067663, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.4864148388117746, + "language_loss": 0.73853236, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76116359, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 3.944838762283325 + }, + { + "auxiliary_loss_clip": 0.01139703, + "auxiliary_loss_mlp": 0.01123267, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00072026, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.8673275020264573, + "language_loss": 0.69119215, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71382189, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 4.098992824554443 + }, + { + "auxiliary_loss_clip": 0.01138382, + "auxiliary_loss_mlp": 0.01124684, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00080156, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.288062719757899, + "language_loss": 0.70147765, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72410834, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.5572361946105957 + }, + { + "auxiliary_loss_clip": 0.01122982, + "auxiliary_loss_mlp": 0.01124354, + "balance_loss_clip": 1.00206375, + "balance_loss_mlp": 1.00094891, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.0128836960476035, + "language_loss": 0.74533296, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76780635, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 2.7312822341918945 + }, + { + "auxiliary_loss_clip": 0.01090655, + "auxiliary_loss_mlp": 0.01124828, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.0008502, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.943705587681791, + "language_loss": 0.74690562, + "learning_rate": 2.555562005426573e-06, + "loss": 0.76906049, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.694279909133911 + }, + { + "auxiliary_loss_clip": 0.01136798, + "auxiliary_loss_mlp": 0.00748337, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00175846, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 3.290133427352432, + "language_loss": 0.76872784, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78757918, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.59201979637146 + }, + { + "auxiliary_loss_clip": 0.01137591, + "auxiliary_loss_mlp": 0.01123988, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00086868, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 2.179225924410789, + "language_loss": 0.8574084, + "learning_rate": 2.554813694924126e-06, + "loss": 0.88002419, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.5885732173919678 + }, + { + "auxiliary_loss_clip": 0.01106532, + "auxiliary_loss_mlp": 0.01123873, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00065827, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.8417709812434027, + "language_loss": 0.81651545, + "learning_rate": 2.554439508107921e-06, + "loss": 0.8388195, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.666398048400879 + }, + { + "auxiliary_loss_clip": 0.01120203, + "auxiliary_loss_mlp": 0.01124211, + "balance_loss_clip": 1.00224447, + "balance_loss_mlp": 1.00071001, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 2.112610086858344, + "language_loss": 0.80857062, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83101475, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.6770942211151123 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01124176, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00057983, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.8393768157478438, + "language_loss": 0.80157727, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82436907, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.5611987113952637 + }, + { + "auxiliary_loss_clip": 0.01170033, + "auxiliary_loss_mlp": 0.00748219, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.00167441, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 2.0784562855485373, + "language_loss": 0.75342453, + "learning_rate": 2.553316821569659e-06, + "loss": 0.77260709, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.505533218383789 + }, + { + "auxiliary_loss_clip": 0.01153446, + "auxiliary_loss_mlp": 0.01124067, + "balance_loss_clip": 1.00214207, + "balance_loss_mlp": 1.00056577, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.633725712932423, + "language_loss": 0.8099606, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83273572, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.5767407417297363 + }, + { + "auxiliary_loss_clip": 0.01102361, + "auxiliary_loss_mlp": 0.01124171, + "balance_loss_clip": 1.0017128, + "balance_loss_mlp": 1.0007658, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.6480758835261315, + "language_loss": 0.7628752, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78514057, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.622321844100952 + }, + { + "auxiliary_loss_clip": 0.01104639, + "auxiliary_loss_mlp": 0.01124547, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00066471, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 2.018706879160165, + "language_loss": 0.74565893, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76795077, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.672281265258789 + }, + { + "auxiliary_loss_clip": 0.01152968, + "auxiliary_loss_mlp": 0.00748421, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00180697, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.7039121545425495, + "language_loss": 0.77672875, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79574263, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.5797836780548096 + }, + { + "auxiliary_loss_clip": 0.01136763, + "auxiliary_loss_mlp": 0.01124485, + "balance_loss_clip": 1.00214112, + "balance_loss_mlp": 1.00088882, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 1.9211447279674572, + "language_loss": 0.73529243, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75790489, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.551668643951416 + }, + { + "auxiliary_loss_clip": 0.01138189, + "auxiliary_loss_mlp": 0.01123925, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00071049, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 3.1093294528726347, + "language_loss": 0.77368587, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79630703, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.5666120052337646 + }, + { + "auxiliary_loss_clip": 0.01121624, + "auxiliary_loss_mlp": 0.00748309, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00166702, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.5115019899801139, + "language_loss": 0.78535819, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80405754, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.6830339431762695 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01123916, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00070107, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 2.0220718957302064, + "language_loss": 0.75324243, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77584898, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.5436365604400635 + }, + { + "auxiliary_loss_clip": 0.01154754, + "auxiliary_loss_mlp": 0.01123023, + "balance_loss_clip": 1.00208867, + "balance_loss_mlp": 1.00066614, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 2.0074440277485435, + "language_loss": 0.84003204, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86280978, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.5372140407562256 + }, + { + "auxiliary_loss_clip": 0.01091216, + "auxiliary_loss_mlp": 0.01124003, + "balance_loss_clip": 1.00194967, + "balance_loss_mlp": 1.00078797, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.902614837080498, + "language_loss": 0.75068128, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77283347, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.7592105865478516 + }, + { + "auxiliary_loss_clip": 0.01153292, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.000669, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.7353880730083, + "language_loss": 0.78781801, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81059361, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.5757925510406494 + }, + { + "auxiliary_loss_clip": 0.01170184, + "auxiliary_loss_mlp": 0.01124278, + "balance_loss_clip": 1.00224948, + "balance_loss_mlp": 1.00077701, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 2.408304499224775, + "language_loss": 0.76285267, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78579724, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.547510862350464 + }, + { + "auxiliary_loss_clip": 0.01135223, + "auxiliary_loss_mlp": 0.011056, + "balance_loss_clip": 1.00214899, + "balance_loss_mlp": 1.00031471, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7738075611729287, + "language_loss": 0.56164014, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58404839, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.043588161468506 + }, + { + "auxiliary_loss_clip": 0.01169872, + "auxiliary_loss_mlp": 0.00748192, + "balance_loss_clip": 1.00214362, + "balance_loss_mlp": 1.00165582, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6099835415507808, + "language_loss": 0.806692, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.82587266, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.5076048374176025 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00041652, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.656198746859814, + "language_loss": 0.81880319, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.841573, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.524867296218872 + }, + { + "auxiliary_loss_clip": 0.01158793, + "auxiliary_loss_mlp": 0.01124564, + "balance_loss_clip": 1.00219107, + "balance_loss_mlp": 1.00077748, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.8129953115637385, + "language_loss": 0.86143589, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88426948, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.5602829456329346 + }, + { + "auxiliary_loss_clip": 0.01136142, + "auxiliary_loss_mlp": 0.01123002, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.00074077, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.9475770643840977, + "language_loss": 0.78096163, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.8035531, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.6093404293060303 + }, + { + "auxiliary_loss_clip": 0.01107884, + "auxiliary_loss_mlp": 0.01123583, + "balance_loss_clip": 1.00213754, + "balance_loss_mlp": 1.00075018, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 1.806512019817769, + "language_loss": 0.76592916, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78824389, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 2.6446573734283447 + }, + { + "auxiliary_loss_clip": 0.01124496, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00056171, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.6740515920073278, + "language_loss": 0.73538077, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75786161, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 4.244471788406372 + }, + { + "auxiliary_loss_clip": 0.01153309, + "auxiliary_loss_mlp": 0.01123318, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00067556, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.9608173662302175, + "language_loss": 0.79094708, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81371331, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 2.555170774459839 + }, + { + "auxiliary_loss_clip": 0.01153111, + "auxiliary_loss_mlp": 0.01123132, + "balance_loss_clip": 1.0019964, + "balance_loss_mlp": 1.00077534, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 2.18147381511726, + "language_loss": 0.8293879, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85215032, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 2.541377305984497 + }, + { + "auxiliary_loss_clip": 0.01153595, + "auxiliary_loss_mlp": 0.01124268, + "balance_loss_clip": 1.00220895, + "balance_loss_mlp": 1.00067186, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.7878414764112744, + "language_loss": 0.87077814, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89355677, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 2.561772584915161 + }, + { + "auxiliary_loss_clip": 0.01143909, + "auxiliary_loss_mlp": 0.01123401, + "balance_loss_clip": 1.00217426, + "balance_loss_mlp": 1.00066292, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5743203599410232, + "language_loss": 0.77739441, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80006754, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 4.033214330673218 + }, + { + "auxiliary_loss_clip": 0.01122858, + "auxiliary_loss_mlp": 0.01123437, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00069952, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 3.3030313023992224, + "language_loss": 0.7970413, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81950426, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.6264848709106445 + }, + { + "auxiliary_loss_clip": 0.01152912, + "auxiliary_loss_mlp": 0.01124116, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00080562, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.6297470098946412, + "language_loss": 0.74700755, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.76977789, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 2.506481647491455 + }, + { + "auxiliary_loss_clip": 0.01123761, + "auxiliary_loss_mlp": 0.01125218, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.0008589, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 2.2245399324111346, + "language_loss": 0.70169538, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72418523, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 2.6008358001708984 + }, + { + "auxiliary_loss_clip": 0.01143481, + "auxiliary_loss_mlp": 0.01123639, + "balance_loss_clip": 1.00196564, + "balance_loss_mlp": 1.00080562, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.5578356949182377, + "language_loss": 0.7128011, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73547232, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 4.0653932094573975 + }, + { + "auxiliary_loss_clip": 0.01154771, + "auxiliary_loss_mlp": 0.01123638, + "balance_loss_clip": 1.00204396, + "balance_loss_mlp": 1.00061369, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.8152891534046784, + "language_loss": 0.78612828, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80891234, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.528050422668457 + }, + { + "auxiliary_loss_clip": 0.01138292, + "auxiliary_loss_mlp": 0.0112333, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00068772, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.5926297105211122, + "language_loss": 0.78929359, + "learning_rate": 2.542454506558389e-06, + "loss": 0.81190979, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 3.9555342197418213 + }, + { + "auxiliary_loss_clip": 0.01136457, + "auxiliary_loss_mlp": 0.01122505, + "balance_loss_clip": 1.00176799, + "balance_loss_mlp": 1.00062501, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.7242575503462212, + "language_loss": 0.88815194, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.91074157, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.6219699382781982 + }, + { + "auxiliary_loss_clip": 0.0117006, + "auxiliary_loss_mlp": 0.011251, + "balance_loss_clip": 1.00211787, + "balance_loss_mlp": 1.00064564, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.9470149537319843, + "language_loss": 0.83136916, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85432076, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.60552716255188 + }, + { + "auxiliary_loss_clip": 0.01170078, + "auxiliary_loss_mlp": 0.01124793, + "balance_loss_clip": 1.00215554, + "balance_loss_mlp": 1.00052881, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.817342048429739, + "language_loss": 0.7176795, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74062824, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.560525417327881 + }, + { + "auxiliary_loss_clip": 0.01153063, + "auxiliary_loss_mlp": 0.01124137, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.0006361, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.1031171278537286, + "language_loss": 0.82937402, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85214603, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.540165424346924 + }, + { + "auxiliary_loss_clip": 0.01138072, + "auxiliary_loss_mlp": 0.0112404, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00063467, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.374619098604588, + "language_loss": 0.82771564, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85033679, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.5654892921447754 + }, + { + "auxiliary_loss_clip": 0.01153406, + "auxiliary_loss_mlp": 0.01125928, + "balance_loss_clip": 1.00212753, + "balance_loss_mlp": 1.00071049, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 5.198284347986797, + "language_loss": 0.77198362, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79477692, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.55476450920105 + }, + { + "auxiliary_loss_clip": 0.01154955, + "auxiliary_loss_mlp": 0.01124185, + "balance_loss_clip": 1.0020771, + "balance_loss_mlp": 1.00068378, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 1.7654259637635101, + "language_loss": 0.72637916, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.74917054, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.550274610519409 + }, + { + "auxiliary_loss_clip": 0.01118075, + "auxiliary_loss_mlp": 0.00746678, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00077558, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.8019384366137142, + "language_loss": 0.58982134, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60846889, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.0830492973327637 + }, + { + "auxiliary_loss_clip": 0.01138207, + "auxiliary_loss_mlp": 0.01123743, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00062382, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.789426923315137, + "language_loss": 0.79109263, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81371212, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.6525862216949463 + }, + { + "auxiliary_loss_clip": 0.01170031, + "auxiliary_loss_mlp": 0.01124738, + "balance_loss_clip": 1.00208819, + "balance_loss_mlp": 1.0008558, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.863354599705892, + "language_loss": 0.67698586, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69993353, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.584416627883911 + }, + { + "auxiliary_loss_clip": 0.01136687, + "auxiliary_loss_mlp": 0.00748517, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00184107, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 4.29571556360922, + "language_loss": 0.74878019, + "learning_rate": 2.538329773967034e-06, + "loss": 0.76763225, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.564920425415039 + }, + { + "auxiliary_loss_clip": 0.01153229, + "auxiliary_loss_mlp": 0.01123635, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00061071, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 2.1207996357776877, + "language_loss": 0.71660852, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73937714, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.5961782932281494 + }, + { + "auxiliary_loss_clip": 0.01136271, + "auxiliary_loss_mlp": 0.00748389, + "balance_loss_clip": 1.00204158, + "balance_loss_mlp": 1.00183249, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.5332872613241773, + "language_loss": 0.78445578, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80330241, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.60795259475708 + }, + { + "auxiliary_loss_clip": 0.01135924, + "auxiliary_loss_mlp": 0.01124208, + "balance_loss_clip": 1.0019244, + "balance_loss_mlp": 1.00080216, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.0124341974408377, + "language_loss": 0.82043928, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84304059, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.5805046558380127 + }, + { + "auxiliary_loss_clip": 0.01135541, + "auxiliary_loss_mlp": 0.01105272, + "balance_loss_clip": 1.00219727, + "balance_loss_mlp": 0.99998629, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6746444521576127, + "language_loss": 0.60767758, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.63008571, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.308882713317871 + }, + { + "auxiliary_loss_clip": 0.01169863, + "auxiliary_loss_mlp": 0.01123544, + "balance_loss_clip": 1.00208557, + "balance_loss_mlp": 1.00061512, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.5874864422272605, + "language_loss": 0.76212198, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78505611, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.516677141189575 + }, + { + "auxiliary_loss_clip": 0.01153104, + "auxiliary_loss_mlp": 0.01123624, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00069547, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.5150394705076666, + "language_loss": 0.77343714, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79620445, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.571408987045288 + }, + { + "auxiliary_loss_clip": 0.01137972, + "auxiliary_loss_mlp": 0.01124689, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00061548, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.6267352301730889, + "language_loss": 0.76427931, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78690588, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.5779030323028564 + }, + { + "auxiliary_loss_clip": 0.0116989, + "auxiliary_loss_mlp": 0.00748541, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00197959, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.5776572961347344, + "language_loss": 0.76947999, + "learning_rate": 2.5353284159381e-06, + "loss": 0.78866422, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.516209125518799 + }, + { + "auxiliary_loss_clip": 0.0117, + "auxiliary_loss_mlp": 0.01123749, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00063014, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.4495911917543938, + "language_loss": 0.82605839, + "learning_rate": 2.534953154686407e-06, + "loss": 0.8489958, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.473604917526245 + }, + { + "auxiliary_loss_clip": 0.01123228, + "auxiliary_loss_mlp": 0.01124416, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00072408, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.34177448370956, + "language_loss": 0.74886525, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77134162, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.57965350151062 + }, + { + "auxiliary_loss_clip": 0.01154844, + "auxiliary_loss_mlp": 0.01123436, + "balance_loss_clip": 1.00197959, + "balance_loss_mlp": 1.00060296, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.9513663697809305, + "language_loss": 0.73336107, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75614393, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.5555081367492676 + }, + { + "auxiliary_loss_clip": 0.01136699, + "auxiliary_loss_mlp": 0.01124845, + "balance_loss_clip": 1.00186217, + "balance_loss_mlp": 1.000772, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 2.1533890367569213, + "language_loss": 0.80934578, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83196121, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.5791707038879395 + }, + { + "auxiliary_loss_clip": 0.01137799, + "auxiliary_loss_mlp": 0.01123272, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00081992, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4583987787579609, + "language_loss": 0.83985949, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86247027, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.6138429641723633 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01123305, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.00066221, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.996736864215384, + "language_loss": 0.75117141, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77376735, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 2.56312894821167 + }, + { + "auxiliary_loss_clip": 0.01139637, + "auxiliary_loss_mlp": 0.00748758, + "balance_loss_clip": 1.00198519, + "balance_loss_mlp": 1.00216389, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 2.1325659014441167, + "language_loss": 0.81947213, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83835614, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.607363700866699 + }, + { + "auxiliary_loss_clip": 0.01141647, + "auxiliary_loss_mlp": 0.01123967, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00065684, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.5673591102773081, + "language_loss": 0.89144093, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91409707, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 4.12602162361145 + }, + { + "auxiliary_loss_clip": 0.011534, + "auxiliary_loss_mlp": 0.00748245, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00181127, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.5436273406985548, + "language_loss": 0.75752252, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77653897, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.5580215454101562 + }, + { + "auxiliary_loss_clip": 0.01153228, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00058222, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.5320232385794925, + "language_loss": 0.77442759, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79719591, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.636378288269043 + }, + { + "auxiliary_loss_clip": 0.01138128, + "auxiliary_loss_mlp": 0.01123193, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.000741, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.6930431633370142, + "language_loss": 0.73559248, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75820577, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.7014338970184326 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00077617, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.0857600357942556, + "language_loss": 0.74876159, + "learning_rate": 2.530823945207421e-06, + "loss": 0.77139258, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 2.5898139476776123 + }, + { + "auxiliary_loss_clip": 0.01119626, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00070465, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 3.5743422036138757, + "language_loss": 0.76426804, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78670537, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 4.11487603187561 + }, + { + "auxiliary_loss_clip": 0.01119884, + "auxiliary_loss_mlp": 0.01105436, + "balance_loss_clip": 1.00167465, + "balance_loss_mlp": 1.00014997, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8630975992375421, + "language_loss": 0.68141949, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70367265, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.2020089626312256 + }, + { + "auxiliary_loss_clip": 0.01137991, + "auxiliary_loss_mlp": 0.01122867, + "balance_loss_clip": 1.0021466, + "balance_loss_mlp": 1.00060642, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 2.0705540904843995, + "language_loss": 0.77770299, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80031157, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.561131477355957 + }, + { + "auxiliary_loss_clip": 0.01108686, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.0009762, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.7331964300344753, + "language_loss": 0.71417749, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73651105, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.6870133876800537 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01122486, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00060606, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.720353337229627, + "language_loss": 0.79677737, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81938612, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 4.007814645767212 + }, + { + "auxiliary_loss_clip": 0.01103783, + "auxiliary_loss_mlp": 0.01123077, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00072026, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.4767892474013768, + "language_loss": 0.74749035, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76975894, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.679764747619629 + }, + { + "auxiliary_loss_clip": 0.01107236, + "auxiliary_loss_mlp": 0.01124037, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00082219, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 7.515523132128055, + "language_loss": 0.792238, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81455076, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 4.014713287353516 + }, + { + "auxiliary_loss_clip": 0.01138967, + "auxiliary_loss_mlp": 0.01122669, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00078905, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.7573608620466243, + "language_loss": 0.75751209, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78012848, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.572953701019287 + }, + { + "auxiliary_loss_clip": 0.01169851, + "auxiliary_loss_mlp": 0.01123964, + "balance_loss_clip": 1.00211048, + "balance_loss_mlp": 1.00074959, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 1.6994346081432927, + "language_loss": 0.59497267, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.6179108, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.5295748710632324 + }, + { + "auxiliary_loss_clip": 0.01138015, + "auxiliary_loss_mlp": 0.01124188, + "balance_loss_clip": 1.00208616, + "balance_loss_mlp": 1.00068688, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.010623516029561, + "language_loss": 0.65332943, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67595148, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.574916362762451 + }, + { + "auxiliary_loss_clip": 0.01169896, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_clip": 1.00209332, + "balance_loss_mlp": 1.00075734, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 2.016982537124992, + "language_loss": 0.72475833, + "learning_rate": 2.526692300132797e-06, + "loss": 0.747697, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.510226011276245 + }, + { + "auxiliary_loss_clip": 0.01153114, + "auxiliary_loss_mlp": 0.01123602, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.00095987, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.7502639050775533, + "language_loss": 0.72747946, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.75024664, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.5806350708007812 + }, + { + "auxiliary_loss_clip": 0.01121117, + "auxiliary_loss_mlp": 0.01122882, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00052524, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.701755069618565, + "language_loss": 0.81229019, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83473015, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 2.671489715576172 + }, + { + "auxiliary_loss_clip": 0.0113936, + "auxiliary_loss_mlp": 0.01123867, + "balance_loss_clip": 1.00210989, + "balance_loss_mlp": 1.00074768, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 1.9563915935711262, + "language_loss": 0.68349463, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70612693, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.5847573280334473 + }, + { + "auxiliary_loss_clip": 0.01136433, + "auxiliary_loss_mlp": 0.00748388, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00171983, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 4.676911076176038, + "language_loss": 0.87100589, + "learning_rate": 2.525189283578157e-06, + "loss": 0.88985413, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.5884523391723633 + }, + { + "auxiliary_loss_clip": 0.01090075, + "auxiliary_loss_mlp": 0.01124631, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00084436, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 1.774439198069867, + "language_loss": 0.64502347, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66717052, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.731349468231201 + }, + { + "auxiliary_loss_clip": 0.01104728, + "auxiliary_loss_mlp": 0.01122529, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00055361, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.8170365267750996, + "language_loss": 0.8219257, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84419823, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.6646764278411865 + }, + { + "auxiliary_loss_clip": 0.01121428, + "auxiliary_loss_mlp": 0.01123391, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.00074816, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 2.2296620491284895, + "language_loss": 0.81342125, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83586949, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.63529896736145 + }, + { + "auxiliary_loss_clip": 0.01136253, + "auxiliary_loss_mlp": 0.01122876, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00051928, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.8955355870220516, + "language_loss": 0.73904324, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.76163453, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.554527997970581 + }, + { + "auxiliary_loss_clip": 0.01169782, + "auxiliary_loss_mlp": 0.00748328, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.00184095, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.7032433167463026, + "language_loss": 0.75194907, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.7711302, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.559542179107666 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01122787, + "balance_loss_clip": 1.00169539, + "balance_loss_mlp": 1.00071633, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 1.9569723993231931, + "language_loss": 0.78401136, + "learning_rate": 2.522934161574342e-06, + "loss": 0.8063218, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.659852981567383 + }, + { + "auxiliary_loss_clip": 0.01121757, + "auxiliary_loss_mlp": 0.01123353, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00061488, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.7170164383981714, + "language_loss": 0.80994421, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83239532, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.5948173999786377 + }, + { + "auxiliary_loss_clip": 0.01137912, + "auxiliary_loss_mlp": 0.01123623, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.0007894, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.035391465833523, + "language_loss": 0.69636285, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.71897817, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.582733154296875 + }, + { + "auxiliary_loss_clip": 0.01154748, + "auxiliary_loss_mlp": 0.01123981, + "balance_loss_clip": 1.00209033, + "balance_loss_mlp": 1.00076675, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.4790434379890967, + "language_loss": 0.81343156, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83621883, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.5674593448638916 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.01123417, + "balance_loss_clip": 1.00202, + "balance_loss_mlp": 1.0007745, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 2.210115301891536, + "language_loss": 0.82216716, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84476507, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.5830202102661133 + }, + { + "auxiliary_loss_clip": 0.01154684, + "auxiliary_loss_mlp": 0.01123161, + "balance_loss_clip": 1.00204539, + "balance_loss_mlp": 1.00061405, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 1.955044111736405, + "language_loss": 0.74588251, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76866096, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.5283589363098145 + }, + { + "auxiliary_loss_clip": 0.01141659, + "auxiliary_loss_mlp": 0.01123297, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00074947, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.7400195445789695, + "language_loss": 0.7651509, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78780037, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.5401391983032227 + }, + { + "auxiliary_loss_clip": 0.01153335, + "auxiliary_loss_mlp": 0.01123578, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.0007453, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.569564130569241, + "language_loss": 0.6499539, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67272305, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.535831928253174 + }, + { + "auxiliary_loss_clip": 0.01137564, + "auxiliary_loss_mlp": 0.01122746, + "balance_loss_clip": 1.00197566, + "balance_loss_mlp": 1.00067532, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.5753294934716555, + "language_loss": 0.71792328, + "learning_rate": 2.519926222304191e-06, + "loss": 0.74052632, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.6293301582336426 + }, + { + "auxiliary_loss_clip": 0.01137912, + "auxiliary_loss_mlp": 0.01123565, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00073171, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 2.6723665527344735, + "language_loss": 0.74595308, + "learning_rate": 2.519550141025255e-06, + "loss": 0.76856792, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.554076910018921 + }, + { + "auxiliary_loss_clip": 0.01138355, + "auxiliary_loss_mlp": 0.01124738, + "balance_loss_clip": 1.00180769, + "balance_loss_mlp": 1.00076079, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.2641050795350623, + "language_loss": 0.75540376, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77803463, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.5549423694610596 + }, + { + "auxiliary_loss_clip": 0.0112311, + "auxiliary_loss_mlp": 0.01123222, + "balance_loss_clip": 1.0020808, + "balance_loss_mlp": 1.00077033, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 1.8633471620028883, + "language_loss": 0.73898172, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76144505, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.5958285331726074 + }, + { + "auxiliary_loss_clip": 0.0114201, + "auxiliary_loss_mlp": 0.01123726, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00060689, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 2.7038813193817806, + "language_loss": 0.68673921, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70939654, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.577951669692993 + }, + { + "auxiliary_loss_clip": 0.01138389, + "auxiliary_loss_mlp": 0.01122906, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00074065, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.6293466551116484, + "language_loss": 0.77385348, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79646647, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 4.0786707401275635 + }, + { + "auxiliary_loss_clip": 0.01088802, + "auxiliary_loss_mlp": 0.01123214, + "balance_loss_clip": 1.00179732, + "balance_loss_mlp": 1.00076222, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.768288439839026, + "language_loss": 0.69338298, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.7155031, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 2.701212167739868 + }, + { + "auxiliary_loss_clip": 0.01154878, + "auxiliary_loss_mlp": 0.0112317, + "balance_loss_clip": 1.00206006, + "balance_loss_mlp": 1.00071788, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 1.8250881281863367, + "language_loss": 0.65128762, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67406809, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 2.554457902908325 + }, + { + "auxiliary_loss_clip": 0.01120849, + "auxiliary_loss_mlp": 0.01122962, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00060523, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 2.312102938285195, + "language_loss": 0.73136449, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.7538026, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.5840096473693848 + }, + { + "auxiliary_loss_clip": 0.01169849, + "auxiliary_loss_mlp": 0.01123915, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00070024, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.9091442887003685, + "language_loss": 0.93734634, + "learning_rate": 2.516540782741694e-06, + "loss": 0.96028399, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.5538442134857178 + }, + { + "auxiliary_loss_clip": 0.01122851, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00074971, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.5281389018936697, + "language_loss": 0.6112653, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.6337201, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 4.096216440200806 + }, + { + "auxiliary_loss_clip": 0.01140014, + "auxiliary_loss_mlp": 0.00748375, + "balance_loss_clip": 1.00213659, + "balance_loss_mlp": 1.00180316, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.750394982526354, + "language_loss": 0.77290148, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79178536, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 2.5824460983276367 + }, + { + "auxiliary_loss_clip": 0.01153938, + "auxiliary_loss_mlp": 0.01122599, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00071883, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.821718821014303, + "language_loss": 0.84628165, + "learning_rate": 2.515411949802964e-06, + "loss": 0.86904705, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.5496747493743896 + }, + { + "auxiliary_loss_clip": 0.01154745, + "auxiliary_loss_mlp": 0.01123151, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00098515, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 1.885392588181546, + "language_loss": 0.76305115, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78583014, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.5692126750946045 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01122827, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00056553, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.7339915805461508, + "language_loss": 0.80712771, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.8294065, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 4.185169458389282 + }, + { + "auxiliary_loss_clip": 0.0115487, + "auxiliary_loss_mlp": 0.01123494, + "balance_loss_clip": 1.00202668, + "balance_loss_mlp": 1.00085163, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 1.7987619652319298, + "language_loss": 0.82041287, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84319651, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 2.5749154090881348 + }, + { + "auxiliary_loss_clip": 0.01155197, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00086808, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.1355343022215694, + "language_loss": 0.77061182, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79340553, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 3.9220125675201416 + }, + { + "auxiliary_loss_clip": 0.01126593, + "auxiliary_loss_mlp": 0.01122562, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.0007776, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.5032043818757685, + "language_loss": 0.68640858, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70890009, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.656198263168335 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01124071, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00076127, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6449824355517884, + "language_loss": 0.7231425, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74561471, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.7284107208251953 + }, + { + "auxiliary_loss_clip": 0.01087784, + "auxiliary_loss_mlp": 0.01122948, + "balance_loss_clip": 1.0016613, + "balance_loss_mlp": 1.00078237, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.7778283334814755, + "language_loss": 0.74668294, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76879025, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.7704849243164062 + }, + { + "auxiliary_loss_clip": 0.01138368, + "auxiliary_loss_mlp": 0.01124119, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.0008086, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 2.4978589437501406, + "language_loss": 0.58772451, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61034936, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.6139914989471436 + }, + { + "auxiliary_loss_clip": 0.01108227, + "auxiliary_loss_mlp": 0.01123783, + "balance_loss_clip": 1.00174701, + "balance_loss_mlp": 1.00075901, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.4006550588678435, + "language_loss": 0.7760157, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79833579, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.729316234588623 + }, + { + "auxiliary_loss_clip": 0.01169749, + "auxiliary_loss_mlp": 0.01122924, + "balance_loss_clip": 1.00206947, + "balance_loss_mlp": 1.00075793, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.7164596986664233, + "language_loss": 0.80909014, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83201683, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.5078418254852295 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01122868, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00079775, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.6465598121102594, + "language_loss": 0.63079619, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65357327, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.537177801132202 + }, + { + "auxiliary_loss_clip": 0.01119814, + "auxiliary_loss_mlp": 0.00748545, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00197351, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 4.518213194237946, + "language_loss": 0.8593061, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87798971, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 2.6668384075164795 + }, + { + "auxiliary_loss_clip": 0.01138105, + "auxiliary_loss_mlp": 0.01122847, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00058568, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.5124736212836252, + "language_loss": 0.72285366, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74546313, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.5963857173919678 + }, + { + "auxiliary_loss_clip": 0.01119579, + "auxiliary_loss_mlp": 0.01123433, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00060034, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 5.7472132781997365, + "language_loss": 0.81669325, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83912337, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.657881498336792 + }, + { + "auxiliary_loss_clip": 0.01119339, + "auxiliary_loss_mlp": 0.00748487, + "balance_loss_clip": 1.00174606, + "balance_loss_mlp": 1.00181437, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.540656036475656, + "language_loss": 0.78801244, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.80669069, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.5734448432922363 + }, + { + "auxiliary_loss_clip": 0.01139267, + "auxiliary_loss_mlp": 0.01123485, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00055611, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 3.412551172886647, + "language_loss": 0.6836527, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70628023, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.5810561180114746 + }, + { + "auxiliary_loss_clip": 0.01110411, + "auxiliary_loss_mlp": 0.0112294, + "balance_loss_clip": 1.00208831, + "balance_loss_mlp": 1.00067925, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6365572562098079, + "language_loss": 0.81431651, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83665001, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.6194467544555664 + }, + { + "auxiliary_loss_clip": 0.010894, + "auxiliary_loss_mlp": 0.01122622, + "balance_loss_clip": 1.0017488, + "balance_loss_mlp": 1.00064743, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.6966290986857917, + "language_loss": 0.73460186, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75672209, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.7188761234283447 + }, + { + "auxiliary_loss_clip": 0.01104954, + "auxiliary_loss_mlp": 0.01123352, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00070941, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.7812247594703985, + "language_loss": 0.76961076, + "learning_rate": 2.508258605639389e-06, + "loss": 0.79189384, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.6838245391845703 + }, + { + "auxiliary_loss_clip": 0.01155007, + "auxiliary_loss_mlp": 0.01124166, + "balance_loss_clip": 1.0021348, + "balance_loss_mlp": 1.00076079, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 5.234512202313073, + "language_loss": 0.85229069, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87508249, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.6074182987213135 + }, + { + "auxiliary_loss_clip": 0.01169744, + "auxiliary_loss_mlp": 0.01123529, + "balance_loss_clip": 1.00208008, + "balance_loss_mlp": 1.00088644, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 2.1515253649824193, + "language_loss": 0.72016096, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74309373, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.526090145111084 + }, + { + "auxiliary_loss_clip": 0.01153086, + "auxiliary_loss_mlp": 0.01123029, + "balance_loss_clip": 1.00208044, + "balance_loss_mlp": 1.00057733, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.4162829621163673, + "language_loss": 0.87130439, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.8940655, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.587071180343628 + }, + { + "auxiliary_loss_clip": 0.01137237, + "auxiliary_loss_mlp": 0.01122911, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00084066, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.891204793710039, + "language_loss": 0.81919873, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84180021, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.5913047790527344 + }, + { + "auxiliary_loss_clip": 0.01153301, + "auxiliary_loss_mlp": 0.01123676, + "balance_loss_clip": 1.00223851, + "balance_loss_mlp": 1.00074792, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 6.124920611179848, + "language_loss": 0.84477836, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86754811, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.6012182235717773 + }, + { + "auxiliary_loss_clip": 0.01154982, + "auxiliary_loss_mlp": 0.01122136, + "balance_loss_clip": 1.00210392, + "balance_loss_mlp": 1.00082922, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.4835609689517042, + "language_loss": 0.69518733, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71795857, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.550203323364258 + }, + { + "auxiliary_loss_clip": 0.01137677, + "auxiliary_loss_mlp": 0.01122904, + "balance_loss_clip": 1.00198627, + "balance_loss_mlp": 1.00073838, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.6571303116458942, + "language_loss": 0.83575022, + "learning_rate": 2.505621403992348e-06, + "loss": 0.858356, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.5675933361053467 + }, + { + "auxiliary_loss_clip": 0.01152651, + "auxiliary_loss_mlp": 0.01123163, + "balance_loss_clip": 1.0020709, + "balance_loss_mlp": 1.00071096, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4976767816212304, + "language_loss": 0.70600343, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72876155, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.5386788845062256 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01122914, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.0007484, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 1.9787483250575784, + "language_loss": 0.81568855, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83827955, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.58483624458313 + }, + { + "auxiliary_loss_clip": 0.01169741, + "auxiliary_loss_mlp": 0.01123701, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00077236, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.7719108688951475, + "language_loss": 0.77498406, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79791844, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 2.506561040878296 + }, + { + "auxiliary_loss_clip": 0.01169701, + "auxiliary_loss_mlp": 0.01122584, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00051332, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.5343803650038121, + "language_loss": 0.76090324, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78382605, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 4.027318954467773 + }, + { + "auxiliary_loss_clip": 0.01152922, + "auxiliary_loss_mlp": 0.01122474, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00049901, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.810201888442365, + "language_loss": 0.7308833, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75363725, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.588534116744995 + }, + { + "auxiliary_loss_clip": 0.0113583, + "auxiliary_loss_mlp": 0.01123203, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00056052, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 1.799042716602617, + "language_loss": 0.76679468, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78938508, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 2.627418041229248 + }, + { + "auxiliary_loss_clip": 0.01133115, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00037313, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7374961732603368, + "language_loss": 0.56989282, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59228051, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.0987372398376465 + }, + { + "auxiliary_loss_clip": 0.01139272, + "auxiliary_loss_mlp": 0.01123694, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00057459, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.5379349139873884, + "language_loss": 0.7106775, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.73330724, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.662095308303833 + }, + { + "auxiliary_loss_clip": 0.01105886, + "auxiliary_loss_mlp": 0.01123694, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.00067019, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 3.040423248920575, + "language_loss": 0.69652855, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71882439, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.655289649963379 + }, + { + "auxiliary_loss_clip": 0.01090576, + "auxiliary_loss_mlp": 0.01121452, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00071633, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.64905310887711, + "language_loss": 0.79777479, + "learning_rate": 2.501852344559726e-06, + "loss": 0.81989503, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 4.110353708267212 + }, + { + "auxiliary_loss_clip": 0.01120582, + "auxiliary_loss_mlp": 0.01123459, + "balance_loss_clip": 1.00200593, + "balance_loss_mlp": 1.00100744, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.5823436416989745, + "language_loss": 0.75211906, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77455938, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 2.583369016647339 + }, + { + "auxiliary_loss_clip": 0.01106206, + "auxiliary_loss_mlp": 0.01122703, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00063252, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.205410416438031, + "language_loss": 0.61579394, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63808298, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 2.796995162963867 + }, + { + "auxiliary_loss_clip": 0.0113649, + "auxiliary_loss_mlp": 0.01122135, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.00063658, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 1.7675839314962243, + "language_loss": 0.72828525, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75087154, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 4.00827956199646 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01122953, + "balance_loss_clip": 1.00190902, + "balance_loss_mlp": 1.00078702, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.0566325700285604, + "language_loss": 0.81547379, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.83806407, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.6128880977630615 + }, + { + "auxiliary_loss_clip": 0.01169577, + "auxiliary_loss_mlp": 0.01122305, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00061631, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 2.2099035000637026, + "language_loss": 0.74546039, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76837921, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 3.9113786220550537 + }, + { + "auxiliary_loss_clip": 0.01169745, + "auxiliary_loss_mlp": 0.01122786, + "balance_loss_clip": 1.00204158, + "balance_loss_mlp": 1.00071549, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 2.052312072053158, + "language_loss": 0.79801774, + "learning_rate": 2.499589994531454e-06, + "loss": 0.820943, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 2.475548505783081 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01122559, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.0007751, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.8048194640610782, + "language_loss": 0.75048172, + "learning_rate": 2.499212869804237e-06, + "loss": 0.7730788, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.6046886444091797 + }, + { + "auxiliary_loss_clip": 0.01090287, + "auxiliary_loss_mlp": 0.01122749, + "balance_loss_clip": 1.00172949, + "balance_loss_mlp": 1.0006783, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.7834219507380102, + "language_loss": 0.79486841, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81699878, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.7340033054351807 + }, + { + "auxiliary_loss_clip": 0.0115019, + "auxiliary_loss_mlp": 0.01104814, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00029135, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.7026742591898284, + "language_loss": 0.54947603, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.57202607, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.1989989280700684 + }, + { + "auxiliary_loss_clip": 0.01169771, + "auxiliary_loss_mlp": 0.01123779, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.0008502, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.7543221088132488, + "language_loss": 0.6958006, + "learning_rate": 2.498081382098581e-06, + "loss": 0.71873611, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 2.5146610736846924 + }, + { + "auxiliary_loss_clip": 0.01139028, + "auxiliary_loss_mlp": 0.01123381, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00073874, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.7790907755705758, + "language_loss": 0.75455695, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77718103, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.748276472091675 + }, + { + "auxiliary_loss_clip": 0.01153159, + "auxiliary_loss_mlp": 0.0112194, + "balance_loss_clip": 1.00178659, + "balance_loss_mlp": 1.00072837, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.6459971863607785, + "language_loss": 0.80439055, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82714152, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.5299971103668213 + }, + { + "auxiliary_loss_clip": 0.01136249, + "auxiliary_loss_mlp": 0.01121859, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.0008378, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.1274285402984296, + "language_loss": 0.80706799, + "learning_rate": 2.496949724407266e-06, + "loss": 0.82964909, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.5659682750701904 + }, + { + "auxiliary_loss_clip": 0.01143348, + "auxiliary_loss_mlp": 0.01123987, + "balance_loss_clip": 1.00215995, + "balance_loss_mlp": 1.00086761, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 3.279208286612942, + "language_loss": 0.73230273, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75497615, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.6654889583587646 + }, + { + "auxiliary_loss_clip": 0.01135869, + "auxiliary_loss_mlp": 0.00748414, + "balance_loss_clip": 1.00194323, + "balance_loss_mlp": 1.00199735, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.843550415169051, + "language_loss": 0.72937375, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74821663, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.6771576404571533 + }, + { + "auxiliary_loss_clip": 0.01125962, + "auxiliary_loss_mlp": 0.0112234, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.00074613, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.507013808971275, + "language_loss": 0.65833235, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68081534, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.683723211288452 + }, + { + "auxiliary_loss_clip": 0.01169793, + "auxiliary_loss_mlp": 0.01122835, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00066972, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.7182237969330858, + "language_loss": 0.82429647, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84722281, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.515281915664673 + }, + { + "auxiliary_loss_clip": 0.01137896, + "auxiliary_loss_mlp": 0.01122116, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00071311, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.5733439469926371, + "language_loss": 0.76692182, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.78952193, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.6024229526519775 + }, + { + "auxiliary_loss_clip": 0.01138154, + "auxiliary_loss_mlp": 0.01122685, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00071025, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 2.2923063236493095, + "language_loss": 0.76097769, + "learning_rate": 2.494685900612569e-06, + "loss": 0.78358608, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.6095778942108154 + }, + { + "auxiliary_loss_clip": 0.01121006, + "auxiliary_loss_mlp": 0.01122586, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.00061107, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 2.0385550785015347, + "language_loss": 0.84971464, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87215054, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.659374237060547 + }, + { + "auxiliary_loss_clip": 0.01141254, + "auxiliary_loss_mlp": 0.01123482, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.0007441, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.7736456384835428, + "language_loss": 0.80206859, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82471591, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.5973637104034424 + }, + { + "auxiliary_loss_clip": 0.01154722, + "auxiliary_loss_mlp": 0.01121956, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.0007441, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.562413148744841, + "language_loss": 0.80181956, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82458639, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.5396006107330322 + }, + { + "auxiliary_loss_clip": 0.01154532, + "auxiliary_loss_mlp": 0.01122543, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00056863, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 9.807403528424748, + "language_loss": 0.74843824, + "learning_rate": 2.493176309387897e-06, + "loss": 0.771209, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.531381845474243 + }, + { + "auxiliary_loss_clip": 0.01120809, + "auxiliary_loss_mlp": 0.01122445, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00056553, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.483426922676486, + "language_loss": 0.73489356, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75732601, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.6737234592437744 + }, + { + "auxiliary_loss_clip": 0.01136309, + "auxiliary_loss_mlp": 0.01122748, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00086856, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.6682303441662159, + "language_loss": 0.82525408, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84784466, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.556182622909546 + }, + { + "auxiliary_loss_clip": 0.01121233, + "auxiliary_loss_mlp": 0.01122275, + "balance_loss_clip": 1.00152087, + "balance_loss_mlp": 1.00058568, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.5332805155778053, + "language_loss": 0.84150529, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86394036, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.6356523036956787 + }, + { + "auxiliary_loss_clip": 0.01139712, + "auxiliary_loss_mlp": 0.01123003, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00074172, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.495374175845598, + "language_loss": 0.78090727, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80353445, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.641775608062744 + }, + { + "auxiliary_loss_clip": 0.01169604, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.0007627, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 1.6894364294844677, + "language_loss": 0.77867496, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80159748, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.5264432430267334 + }, + { + "auxiliary_loss_clip": 0.01121246, + "auxiliary_loss_mlp": 0.01122169, + "balance_loss_clip": 1.00187826, + "balance_loss_mlp": 1.00067043, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.5991241421191356, + "language_loss": 0.65084529, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67327946, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.728003740310669 + }, + { + "auxiliary_loss_clip": 0.01154483, + "auxiliary_loss_mlp": 0.01123267, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00072002, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.4608455875864155, + "language_loss": 0.74092114, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76369864, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.561497688293457 + }, + { + "auxiliary_loss_clip": 0.01125914, + "auxiliary_loss_mlp": 0.0112292, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.00084949, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.0849237412604715, + "language_loss": 0.78562677, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80811512, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.598970413208008 + }, + { + "auxiliary_loss_clip": 0.01121438, + "auxiliary_loss_mlp": 0.01123015, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.0008496, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.5768108399006484, + "language_loss": 0.72975188, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75219643, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 4.025218963623047 + }, + { + "auxiliary_loss_clip": 0.01121119, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00073195, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 1.6767090328816951, + "language_loss": 0.75614953, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77859545, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 2.6053664684295654 + }, + { + "auxiliary_loss_clip": 0.01153025, + "auxiliary_loss_mlp": 0.01122276, + "balance_loss_clip": 1.00199294, + "balance_loss_mlp": 1.00068259, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.4758763884248676, + "language_loss": 0.69092286, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71367586, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 2.5458967685699463 + }, + { + "auxiliary_loss_clip": 0.0115482, + "auxiliary_loss_mlp": 0.01122236, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.00064242, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.3710873586373047, + "language_loss": 0.70326877, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72603935, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.5868217945098877 + }, + { + "auxiliary_loss_clip": 0.01152389, + "auxiliary_loss_mlp": 0.01121736, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00052392, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.611204517712335, + "language_loss": 0.72290218, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74564344, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.5756540298461914 + }, + { + "auxiliary_loss_clip": 0.0114311, + "auxiliary_loss_mlp": 0.00748627, + "balance_loss_clip": 1.0020467, + "balance_loss_mlp": 1.00205946, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.9358935336465342, + "language_loss": 0.76679957, + "learning_rate": 2.487890389750719e-06, + "loss": 0.78571689, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 4.023249626159668 + }, + { + "auxiliary_loss_clip": 0.01136061, + "auxiliary_loss_mlp": 0.01121989, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00077677, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.686770307529047, + "language_loss": 0.70934319, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.7319237, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.6221628189086914 + }, + { + "auxiliary_loss_clip": 0.01107085, + "auxiliary_loss_mlp": 0.0112403, + "balance_loss_clip": 1.00187719, + "balance_loss_mlp": 1.00081527, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.8475183615751332, + "language_loss": 0.70283592, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72514707, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 2.7211780548095703 + }, + { + "auxiliary_loss_clip": 0.0113604, + "auxiliary_loss_mlp": 0.01122843, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00096345, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.7023104216416654, + "language_loss": 0.82521361, + "learning_rate": 2.486757219574983e-06, + "loss": 0.8478024, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 2.6473639011383057 + }, + { + "auxiliary_loss_clip": 0.01155023, + "auxiliary_loss_mlp": 0.01123762, + "balance_loss_clip": 1.00214505, + "balance_loss_mlp": 1.00073862, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.0227616506175248, + "language_loss": 0.68236756, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70515543, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 4.104346752166748 + }, + { + "auxiliary_loss_clip": 0.01138966, + "auxiliary_loss_mlp": 0.0074843, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.0019877, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.4035304707092022, + "language_loss": 0.77850437, + "learning_rate": 2.486001680477873e-06, + "loss": 0.7973783, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 2.6931469440460205 + }, + { + "auxiliary_loss_clip": 0.01136041, + "auxiliary_loss_mlp": 0.01122494, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00080562, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.5755443857645064, + "language_loss": 0.68684781, + "learning_rate": 2.485623883278308e-06, + "loss": 0.7094332, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 3.9786741733551025 + }, + { + "auxiliary_loss_clip": 0.01121472, + "auxiliary_loss_mlp": 0.01122472, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00068831, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.6842199587142899, + "language_loss": 0.6253683, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64780772, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.640774965286255 + }, + { + "auxiliary_loss_clip": 0.01169698, + "auxiliary_loss_mlp": 0.01123609, + "balance_loss_clip": 1.00206721, + "balance_loss_mlp": 1.00077593, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 3.2937470146992505, + "language_loss": 0.72460008, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.7475332, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 2.4713480472564697 + }, + { + "auxiliary_loss_clip": 0.01137361, + "auxiliary_loss_mlp": 0.01122295, + "balance_loss_clip": 1.00180626, + "balance_loss_mlp": 1.00060582, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.6263724159042494, + "language_loss": 0.76888418, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79148078, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 2.596635580062866 + }, + { + "auxiliary_loss_clip": 0.01152732, + "auxiliary_loss_mlp": 0.01121497, + "balance_loss_clip": 1.00194061, + "balance_loss_mlp": 1.00037992, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.660972870296466, + "language_loss": 0.70614398, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72888625, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.553194761276245 + }, + { + "auxiliary_loss_clip": 0.01138173, + "auxiliary_loss_mlp": 0.00748411, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00201726, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.9715226736277898, + "language_loss": 0.76137424, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78024006, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.5868828296661377 + }, + { + "auxiliary_loss_clip": 0.01153206, + "auxiliary_loss_mlp": 0.01123371, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00082386, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 3.3341558380581877, + "language_loss": 0.82231289, + "learning_rate": 2.483356713869341e-06, + "loss": 0.84507871, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.5328221321105957 + }, + { + "auxiliary_loss_clip": 0.01122454, + "auxiliary_loss_mlp": 0.01122259, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00066555, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 1.8234665447487817, + "language_loss": 0.85037553, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87282264, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.6117517948150635 + }, + { + "auxiliary_loss_clip": 0.01137803, + "auxiliary_loss_mlp": 0.01123033, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00058091, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.871880802396429, + "language_loss": 0.68279529, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.70540363, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.5726118087768555 + }, + { + "auxiliary_loss_clip": 0.0113797, + "auxiliary_loss_mlp": 0.01122958, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00060153, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.8530220123390502, + "language_loss": 0.76736349, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.78997278, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.5659520626068115 + }, + { + "auxiliary_loss_clip": 0.01137769, + "auxiliary_loss_mlp": 0.01122023, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00081134, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.212446290718443, + "language_loss": 0.7456497, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.7682476, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.590808391571045 + }, + { + "auxiliary_loss_clip": 0.01120652, + "auxiliary_loss_mlp": 0.01122374, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00068474, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 3.1987198931216407, + "language_loss": 0.64298266, + "learning_rate": 2.481466901851506e-06, + "loss": 0.6654129, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.6246609687805176 + }, + { + "auxiliary_loss_clip": 0.01138393, + "auxiliary_loss_mlp": 0.01123334, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.00078678, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 6.381688205530926, + "language_loss": 0.8002491, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.82286644, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.5548794269561768 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01123198, + "balance_loss_clip": 1.00203192, + "balance_loss_mlp": 1.00084138, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.4631598251825064, + "language_loss": 0.79639065, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81886733, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.6569550037384033 + }, + { + "auxiliary_loss_clip": 0.01154547, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00072336, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 7.989707277368174, + "language_loss": 0.79885161, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82162595, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.6078007221221924 + }, + { + "auxiliary_loss_clip": 0.01119483, + "auxiliary_loss_mlp": 0.01122181, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00077879, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.9373321897852884, + "language_loss": 0.69731617, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71973282, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.6963369846343994 + }, + { + "auxiliary_loss_clip": 0.01103091, + "auxiliary_loss_mlp": 0.01104503, + "balance_loss_clip": 1.00169206, + "balance_loss_mlp": 0.99998015, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8741969471113914, + "language_loss": 0.56869966, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59077561, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.289822578430176 + }, + { + "auxiliary_loss_clip": 0.01108869, + "auxiliary_loss_mlp": 0.01122584, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00079942, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.9133090670356137, + "language_loss": 0.76278442, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78509891, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.6919925212860107 + }, + { + "auxiliary_loss_clip": 0.01154017, + "auxiliary_loss_mlp": 0.0112244, + "balance_loss_clip": 1.00209367, + "balance_loss_mlp": 1.00084686, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.4602224618052788, + "language_loss": 0.80708277, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82984728, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.5341379642486572 + }, + { + "auxiliary_loss_clip": 0.0112203, + "auxiliary_loss_mlp": 0.0110486, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.0003376, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6778976771347041, + "language_loss": 0.54676604, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56903493, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.2150845527648926 + }, + { + "auxiliary_loss_clip": 0.01169735, + "auxiliary_loss_mlp": 0.01122204, + "balance_loss_clip": 1.00221062, + "balance_loss_mlp": 1.00061011, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.4808002198804409, + "language_loss": 0.69806468, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.7209841, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.597810745239258 + }, + { + "auxiliary_loss_clip": 0.01121018, + "auxiliary_loss_mlp": 0.01122457, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00067282, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.3441083546933614, + "language_loss": 0.76594836, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78838313, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.667149305343628 + }, + { + "auxiliary_loss_clip": 0.01137783, + "auxiliary_loss_mlp": 0.01122502, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00071776, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.7985865760965285, + "language_loss": 0.83794731, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86055017, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.541914939880371 + }, + { + "auxiliary_loss_clip": 0.01136224, + "auxiliary_loss_mlp": 0.01121799, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00058758, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 1.9155114358717968, + "language_loss": 0.78155798, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80413818, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.5905601978302 + }, + { + "auxiliary_loss_clip": 0.01154727, + "auxiliary_loss_mlp": 0.01122808, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.00092876, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.6049790377611293, + "language_loss": 0.73643613, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75921154, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.557605028152466 + }, + { + "auxiliary_loss_clip": 0.01136247, + "auxiliary_loss_mlp": 0.01122067, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00066471, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 2.106873307938227, + "language_loss": 0.7441079, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76669109, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.605743646621704 + }, + { + "auxiliary_loss_clip": 0.01106206, + "auxiliary_loss_mlp": 0.0112189, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00058246, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.550098732160817, + "language_loss": 0.76204765, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78432858, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.6802592277526855 + }, + { + "auxiliary_loss_clip": 0.01136185, + "auxiliary_loss_mlp": 0.01122095, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00069225, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.7196668844575234, + "language_loss": 0.7321254, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75470823, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 4.075127601623535 + }, + { + "auxiliary_loss_clip": 0.01120524, + "auxiliary_loss_mlp": 0.01121745, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.00062859, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 3.511314400319288, + "language_loss": 0.79806399, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.82048666, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 2.633239269256592 + }, + { + "auxiliary_loss_clip": 0.01120532, + "auxiliary_loss_mlp": 0.01123392, + "balance_loss_clip": 1.00173008, + "balance_loss_mlp": 1.00055861, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 1.9701374273763212, + "language_loss": 0.75473857, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77717781, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 2.624784231185913 + }, + { + "auxiliary_loss_clip": 0.01141531, + "auxiliary_loss_mlp": 0.01122555, + "balance_loss_clip": 1.00220895, + "balance_loss_mlp": 1.00077057, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.976393121583232, + "language_loss": 0.72657382, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74921465, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.533957004547119 + }, + { + "auxiliary_loss_clip": 0.01154728, + "auxiliary_loss_mlp": 0.01123214, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.0007627, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.066077080278027, + "language_loss": 0.627648, + "learning_rate": 2.473903107384165e-06, + "loss": 0.6504274, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.56512451171875 + }, + { + "auxiliary_loss_clip": 0.01136287, + "auxiliary_loss_mlp": 0.00746768, + "balance_loss_clip": 1.00174975, + "balance_loss_mlp": 1.00109339, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7500322878667469, + "language_loss": 0.5261122, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54494274, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 4.576421499252319 + }, + { + "auxiliary_loss_clip": 0.01138201, + "auxiliary_loss_mlp": 0.01123778, + "balance_loss_clip": 1.00186217, + "balance_loss_mlp": 1.00084925, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.8070147651576156, + "language_loss": 0.70215499, + "learning_rate": 2.473146330693997e-06, + "loss": 0.72477478, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 2.600212335586548 + }, + { + "auxiliary_loss_clip": 0.01088919, + "auxiliary_loss_mlp": 0.01121631, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00080085, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.4572427180610805, + "language_loss": 0.69632584, + "learning_rate": 2.472767915429105e-06, + "loss": 0.71843135, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 2.7126753330230713 + }, + { + "auxiliary_loss_clip": 0.01133058, + "auxiliary_loss_mlp": 0.01105119, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00059652, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8944126099908378, + "language_loss": 0.64078468, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66316646, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 2.9564013481140137 + }, + { + "auxiliary_loss_clip": 0.01120461, + "auxiliary_loss_mlp": 0.01122215, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00062132, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 1.9849571173126286, + "language_loss": 0.73454058, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75696731, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 4.078481435775757 + }, + { + "auxiliary_loss_clip": 0.01169537, + "auxiliary_loss_mlp": 0.01122121, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00062287, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 2.238931957527802, + "language_loss": 0.79620981, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81912637, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 2.5450119972229004 + }, + { + "auxiliary_loss_clip": 0.0112044, + "auxiliary_loss_mlp": 0.01122303, + "balance_loss_clip": 1.00172389, + "balance_loss_mlp": 1.00070989, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 2.6020224735268007, + "language_loss": 0.7662378, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78866524, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 4.020474433898926 + }, + { + "auxiliary_loss_clip": 0.01137116, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00054669, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7938026705161838, + "language_loss": 0.63776273, + "learning_rate": 2.470875570480556e-06, + "loss": 0.66017693, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 2.9095699787139893 + }, + { + "auxiliary_loss_clip": 0.01169653, + "auxiliary_loss_mlp": 0.01121934, + "balance_loss_clip": 1.00206304, + "balance_loss_mlp": 1.00043619, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 1.4832706237623097, + "language_loss": 0.85665846, + "learning_rate": 2.470497047866489e-06, + "loss": 0.8795743, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.571897506713867 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01122847, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00068116, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.7111644134623507, + "language_loss": 0.80215824, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82492864, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.528160572052002 + }, + { + "auxiliary_loss_clip": 0.01136341, + "auxiliary_loss_mlp": 0.01122797, + "balance_loss_clip": 1.00185895, + "balance_loss_mlp": 1.0007267, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.8770533831857452, + "language_loss": 0.82789147, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.85048282, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.572225332260132 + }, + { + "auxiliary_loss_clip": 0.01153042, + "auxiliary_loss_mlp": 0.0112369, + "balance_loss_clip": 1.00206006, + "balance_loss_mlp": 1.00066638, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 4.3661740003514184, + "language_loss": 0.7034322, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72619951, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.5922162532806396 + }, + { + "auxiliary_loss_clip": 0.01142788, + "auxiliary_loss_mlp": 0.01122336, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 1.0005517, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6562916449074712, + "language_loss": 0.74400878, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76666003, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.56673264503479 + }, + { + "auxiliary_loss_clip": 0.01169637, + "auxiliary_loss_mlp": 0.01122787, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00071657, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 2.1765349515727506, + "language_loss": 0.80099845, + "learning_rate": 2.468604167463827e-06, + "loss": 0.82392263, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.4863038063049316 + }, + { + "auxiliary_loss_clip": 0.01127425, + "auxiliary_loss_mlp": 0.00748178, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00177085, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.5942203242225936, + "language_loss": 0.73083758, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.74959362, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.6702921390533447 + }, + { + "auxiliary_loss_clip": 0.011367, + "auxiliary_loss_mlp": 0.01122713, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00073767, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.9227576963894055, + "language_loss": 0.87190449, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89449865, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.5958774089813232 + }, + { + "auxiliary_loss_clip": 0.01169767, + "auxiliary_loss_mlp": 0.01122631, + "balance_loss_clip": 1.00211155, + "balance_loss_mlp": 1.00065565, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 1.92111242400276, + "language_loss": 0.75785077, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78077471, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.473135471343994 + }, + { + "auxiliary_loss_clip": 0.01119619, + "auxiliary_loss_mlp": 0.01121394, + "balance_loss_clip": 1.00171244, + "balance_loss_mlp": 1.00075412, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 2.0765990614132996, + "language_loss": 0.6513446, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67375475, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 2.84065842628479 + }, + { + "auxiliary_loss_clip": 0.01169729, + "auxiliary_loss_mlp": 0.01123067, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00061488, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.7074314520900151, + "language_loss": 0.7804178, + "learning_rate": 2.466710842823274e-06, + "loss": 0.8033458, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.505659341812134 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.00748406, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00193882, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5189548967188957, + "language_loss": 0.7715115, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79037046, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.577857255935669 + }, + { + "auxiliary_loss_clip": 0.01137444, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_clip": 1.00197017, + "balance_loss_mlp": 1.00068283, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.5148182265520989, + "language_loss": 0.73502922, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75763035, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.6680803298950195 + }, + { + "auxiliary_loss_clip": 0.01136088, + "auxiliary_loss_mlp": 0.01122751, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.00068045, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.8216432160118434, + "language_loss": 0.75621098, + "learning_rate": 2.465574635551405e-06, + "loss": 0.7787993, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.6457104682922363 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.0112284, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00076985, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.6060244849673762, + "language_loss": 0.69929802, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72190344, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.6135997772216797 + }, + { + "auxiliary_loss_clip": 0.01136467, + "auxiliary_loss_mlp": 0.01122475, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.00059581, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.5690647129141913, + "language_loss": 0.694134, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71672344, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.555799961090088 + }, + { + "auxiliary_loss_clip": 0.01137955, + "auxiliary_loss_mlp": 0.01122578, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00069845, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 2.418468061765195, + "language_loss": 0.82980686, + "learning_rate": 2.464438269387809e-06, + "loss": 0.85241216, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.5553441047668457 + }, + { + "auxiliary_loss_clip": 0.01120832, + "auxiliary_loss_mlp": 0.01123753, + "balance_loss_clip": 1.00183189, + "balance_loss_mlp": 1.00063336, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.575566441596798, + "language_loss": 0.74882555, + "learning_rate": 2.464059445424366e-06, + "loss": 0.77127141, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.588014602661133 + }, + { + "auxiliary_loss_clip": 0.01099009, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.00153744, + "balance_loss_mlp": 1.00022304, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.678182876122389, + "language_loss": 0.55636883, + "learning_rate": 2.463680603863743e-06, + "loss": 0.5783987, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.3081679344177246 + }, + { + "auxiliary_loss_clip": 0.0113875, + "auxiliary_loss_mlp": 0.01122188, + "balance_loss_clip": 1.0019511, + "balance_loss_mlp": 1.0006901, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.6083627549771997, + "language_loss": 0.74422228, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76683164, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.6258866786956787 + }, + { + "auxiliary_loss_clip": 0.01138002, + "auxiliary_loss_mlp": 0.01121847, + "balance_loss_clip": 1.00194132, + "balance_loss_mlp": 1.00073051, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.6298575128723487, + "language_loss": 0.74183393, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76443237, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.603116750717163 + }, + { + "auxiliary_loss_clip": 0.01137502, + "auxiliary_loss_mlp": 0.01122043, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00044942, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 2.4197538033426182, + "language_loss": 0.73400211, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75659764, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.6122677326202393 + }, + { + "auxiliary_loss_clip": 0.01169662, + "auxiliary_loss_mlp": 0.01122181, + "balance_loss_clip": 1.00206184, + "balance_loss_mlp": 1.00077832, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.3722459452824436, + "language_loss": 0.73916435, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76208276, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.5905277729034424 + }, + { + "auxiliary_loss_clip": 0.01137755, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00069427, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.9270569716031212, + "language_loss": 0.79791296, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82051528, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.5935451984405518 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.01121917, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00060964, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.9568354287722345, + "language_loss": 0.71927959, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74172127, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 4.047601222991943 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01122426, + "balance_loss_clip": 1.00200069, + "balance_loss_mlp": 1.00073707, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.711230501640432, + "language_loss": 0.70538652, + "learning_rate": 2.461028221425126e-06, + "loss": 0.7283057, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.520993232727051 + }, + { + "auxiliary_loss_clip": 0.01154472, + "auxiliary_loss_mlp": 0.01121544, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00052261, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.112474564735236, + "language_loss": 0.67836595, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.7011261, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.5424225330352783 + }, + { + "auxiliary_loss_clip": 0.01122237, + "auxiliary_loss_mlp": 0.01122435, + "balance_loss_clip": 1.00179136, + "balance_loss_mlp": 1.00055528, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.815176291808838, + "language_loss": 0.83463168, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85707843, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 2.66231632232666 + }, + { + "auxiliary_loss_clip": 0.01151401, + "auxiliary_loss_mlp": 0.01103776, + "balance_loss_clip": 1.00166464, + "balance_loss_mlp": 1.00001609, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.773425787781989, + "language_loss": 0.55196774, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57451952, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 3.1964964866638184 + }, + { + "auxiliary_loss_clip": 0.01102461, + "auxiliary_loss_mlp": 0.01122037, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.00082517, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.1357449544212286, + "language_loss": 0.82816529, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.85041034, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 4.069897413253784 + }, + { + "auxiliary_loss_clip": 0.01169745, + "auxiliary_loss_mlp": 0.01122045, + "balance_loss_clip": 1.00207961, + "balance_loss_mlp": 1.00054681, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 2.332162831558129, + "language_loss": 0.83924145, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86215937, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.5069971084594727 + }, + { + "auxiliary_loss_clip": 0.01136084, + "auxiliary_loss_mlp": 0.01122495, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00071073, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.611714193634507, + "language_loss": 0.77098942, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79357523, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 2.5805704593658447 + }, + { + "auxiliary_loss_clip": 0.01152734, + "auxiliary_loss_mlp": 0.01121627, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00051057, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.890227939517629, + "language_loss": 0.76031244, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78305608, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 2.5643324851989746 + }, + { + "auxiliary_loss_clip": 0.01138686, + "auxiliary_loss_mlp": 0.01122382, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00069344, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 1.9023855186463372, + "language_loss": 0.69416636, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71677703, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.572936773300171 + }, + { + "auxiliary_loss_clip": 0.0109124, + "auxiliary_loss_mlp": 0.01122075, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.00057673, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.5851344529716436, + "language_loss": 0.73142272, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75355589, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 4.187570333480835 + }, + { + "auxiliary_loss_clip": 0.0113721, + "auxiliary_loss_mlp": 0.01122366, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.0005821, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.4932574809550603, + "language_loss": 0.64779842, + "learning_rate": 2.457237618887458e-06, + "loss": 0.67039418, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.6837289333343506 + }, + { + "auxiliary_loss_clip": 0.0115302, + "auxiliary_loss_mlp": 0.01122213, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00071478, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.183871916027273, + "language_loss": 0.80380684, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82655919, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 3.979142665863037 + }, + { + "auxiliary_loss_clip": 0.01154681, + "auxiliary_loss_mlp": 0.0112309, + "balance_loss_clip": 1.00216508, + "balance_loss_mlp": 1.00092459, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.6452494730678129, + "language_loss": 0.65711612, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67989385, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 2.612708330154419 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01123134, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00058651, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.4692196191468714, + "language_loss": 0.76120639, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.78381348, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.5624752044677734 + }, + { + "auxiliary_loss_clip": 0.01169686, + "auxiliary_loss_mlp": 0.01122431, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00074172, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.6350714628307126, + "language_loss": 0.81478608, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83770728, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.4886155128479004 + }, + { + "auxiliary_loss_clip": 0.01105411, + "auxiliary_loss_mlp": 0.01121577, + "balance_loss_clip": 1.00174284, + "balance_loss_mlp": 1.00055552, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.6621267102882307, + "language_loss": 0.81620467, + "learning_rate": 2.455341666526582e-06, + "loss": 0.83847463, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.67732834815979 + }, + { + "auxiliary_loss_clip": 0.01121672, + "auxiliary_loss_mlp": 0.01123518, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00068474, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.7247573435000483, + "language_loss": 0.69587767, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71832955, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.7599918842315674 + }, + { + "auxiliary_loss_clip": 0.01086136, + "auxiliary_loss_mlp": 0.01122187, + "balance_loss_clip": 1.00153983, + "balance_loss_mlp": 1.0007838, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 1.9634818754861387, + "language_loss": 0.71923542, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.74131858, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.6574790477752686 + }, + { + "auxiliary_loss_clip": 0.01152974, + "auxiliary_loss_mlp": 0.01122294, + "balance_loss_clip": 1.00197971, + "balance_loss_mlp": 1.00060487, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.5852114959294468, + "language_loss": 0.69031286, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71306551, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.5385384559631348 + }, + { + "auxiliary_loss_clip": 0.01153073, + "auxiliary_loss_mlp": 0.01122159, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00056565, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.0888859969477833, + "language_loss": 0.75163662, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77438891, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.6842312812805176 + }, + { + "auxiliary_loss_clip": 0.0115432, + "auxiliary_loss_mlp": 0.01121559, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00072885, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.0475376170826918, + "language_loss": 0.81404555, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83680433, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.5321037769317627 + }, + { + "auxiliary_loss_clip": 0.01137675, + "auxiliary_loss_mlp": 0.01121918, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.0007062, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.5901733136289062, + "language_loss": 0.73348767, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75608361, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.5639607906341553 + }, + { + "auxiliary_loss_clip": 0.01152775, + "auxiliary_loss_mlp": 0.01121509, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00067806, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 4.033470903845877, + "language_loss": 0.79922509, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.82196796, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.579789876937866 + }, + { + "auxiliary_loss_clip": 0.01154559, + "auxiliary_loss_mlp": 0.0112267, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00069463, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 2.258541778488086, + "language_loss": 0.80838597, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83115828, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.6374261379241943 + }, + { + "auxiliary_loss_clip": 0.01137793, + "auxiliary_loss_mlp": 0.01121825, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00089908, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 1.922086152235152, + "language_loss": 0.79616284, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81875908, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.5490524768829346 + }, + { + "auxiliary_loss_clip": 0.01137602, + "auxiliary_loss_mlp": 0.01121813, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00069666, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.7465805154451626, + "language_loss": 0.68594289, + "learning_rate": 2.451548468607584e-06, + "loss": 0.7085371, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.592132568359375 + }, + { + "auxiliary_loss_clip": 0.01154718, + "auxiliary_loss_mlp": 0.00748255, + "balance_loss_clip": 1.00218081, + "balance_loss_mlp": 1.00170231, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 5.183335842795897, + "language_loss": 0.80895376, + "learning_rate": 2.451169054403126e-06, + "loss": 0.8279835, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.5396950244903564 + }, + { + "auxiliary_loss_clip": 0.0115409, + "auxiliary_loss_mlp": 0.01121953, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00064576, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.7585213271732736, + "language_loss": 0.67497385, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69773424, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.5747787952423096 + }, + { + "auxiliary_loss_clip": 0.01122281, + "auxiliary_loss_mlp": 0.01121975, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00085855, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.9024681282576603, + "language_loss": 0.69916904, + "learning_rate": 2.450410174683472e-06, + "loss": 0.72161162, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.5928494930267334 + }, + { + "auxiliary_loss_clip": 0.01122446, + "auxiliary_loss_mlp": 0.01121307, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00057149, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 2.491603897342718, + "language_loss": 0.72404009, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.7464776, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.639730930328369 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.00748121, + "balance_loss_clip": 1.00152087, + "balance_loss_mlp": 1.00169361, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.6517114154867916, + "language_loss": 0.84908724, + "learning_rate": 2.449651226645422e-06, + "loss": 0.86759084, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.6755051612854004 + }, + { + "auxiliary_loss_clip": 0.01136144, + "auxiliary_loss_mlp": 0.01121424, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00068843, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.5791448127498626, + "language_loss": 0.8385278, + "learning_rate": 2.449271727042973e-06, + "loss": 0.86110353, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 2.645127534866333 + }, + { + "auxiliary_loss_clip": 0.0113573, + "auxiliary_loss_mlp": 0.01121875, + "balance_loss_clip": 1.00192964, + "balance_loss_mlp": 1.00056791, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.9368563218562047, + "language_loss": 0.77005827, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79263431, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.5731594562530518 + }, + { + "auxiliary_loss_clip": 0.0113334, + "auxiliary_loss_mlp": 0.01103088, + "balance_loss_clip": 1.00172365, + "balance_loss_mlp": 1.00009084, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7796456086528486, + "language_loss": 0.60079157, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62315583, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.131152391433716 + }, + { + "auxiliary_loss_clip": 0.01139457, + "auxiliary_loss_mlp": 0.0112229, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00060105, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.637521555348515, + "language_loss": 0.81923115, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84184861, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.588132619857788 + }, + { + "auxiliary_loss_clip": 0.01136018, + "auxiliary_loss_mlp": 0.01121215, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00057554, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.5254650572241026, + "language_loss": 0.74989533, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77246761, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.608713388442993 + }, + { + "auxiliary_loss_clip": 0.01120039, + "auxiliary_loss_mlp": 0.01120436, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00065458, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.689848509026034, + "language_loss": 0.65220332, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67460811, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 4.209133625030518 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01121883, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00076628, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.924099958771575, + "language_loss": 0.67835951, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.7009573, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.6050188541412354 + }, + { + "auxiliary_loss_clip": 0.01169544, + "auxiliary_loss_mlp": 0.01121653, + "balance_loss_clip": 1.00204539, + "balance_loss_mlp": 1.00063205, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 2.216763354993932, + "language_loss": 0.7201584, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74307036, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.678009271621704 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.01122072, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.0005734, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 1.9539207643185477, + "language_loss": 0.6518414, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67442358, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.603384017944336 + }, + { + "auxiliary_loss_clip": 0.01138229, + "auxiliary_loss_mlp": 0.01122611, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00063634, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 1.8750961454009007, + "language_loss": 0.73739958, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76000798, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 2.6026041507720947 + }, + { + "auxiliary_loss_clip": 0.01087586, + "auxiliary_loss_mlp": 0.01121623, + "balance_loss_clip": 1.00169587, + "balance_loss_mlp": 1.00060177, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 2.0575915783242, + "language_loss": 0.79036725, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81245935, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.71079158782959 + }, + { + "auxiliary_loss_clip": 0.01138185, + "auxiliary_loss_mlp": 0.01122307, + "balance_loss_clip": 1.00185335, + "balance_loss_mlp": 1.00052261, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 1.8078965273218288, + "language_loss": 0.79985487, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82245982, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 3.9868478775024414 + }, + { + "auxiliary_loss_clip": 0.01152614, + "auxiliary_loss_mlp": 0.01121379, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00054836, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.70579561172874, + "language_loss": 0.76698196, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.78972191, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.504993200302124 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.0112193, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00071788, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.5277820993612106, + "language_loss": 0.83588147, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85847569, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 2.621471405029297 + }, + { + "auxiliary_loss_clip": 0.01169407, + "auxiliary_loss_mlp": 0.01121713, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00097811, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.5508460421960983, + "language_loss": 0.84153712, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86444831, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 2.506941080093384 + }, + { + "auxiliary_loss_clip": 0.01122606, + "auxiliary_loss_mlp": 0.01122088, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00078082, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.591066271474681, + "language_loss": 0.81217152, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83461851, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.6635963916778564 + }, + { + "auxiliary_loss_clip": 0.01137587, + "auxiliary_loss_mlp": 0.01121636, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00099635, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.830744338704623, + "language_loss": 0.81477201, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83736432, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 3.9967234134674072 + }, + { + "auxiliary_loss_clip": 0.0115411, + "auxiliary_loss_mlp": 0.00748239, + "balance_loss_clip": 1.00201166, + "balance_loss_mlp": 1.00144327, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.5922131070649141, + "language_loss": 0.77619284, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79521632, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.6355252265930176 + }, + { + "auxiliary_loss_clip": 0.01122826, + "auxiliary_loss_mlp": 0.01122126, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00062776, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.5563635338634119, + "language_loss": 0.72864008, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.75108957, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 4.015040397644043 + }, + { + "auxiliary_loss_clip": 0.01138979, + "auxiliary_loss_mlp": 0.01121168, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00062335, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.534404094976207, + "language_loss": 0.75236589, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77496737, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.633547067642212 + }, + { + "auxiliary_loss_clip": 0.01090389, + "auxiliary_loss_mlp": 0.01120849, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.0007813, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 2.7879934952409364, + "language_loss": 0.76102811, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78314054, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 2.6839041709899902 + }, + { + "auxiliary_loss_clip": 0.01169441, + "auxiliary_loss_mlp": 0.011226, + "balance_loss_clip": 1.00205719, + "balance_loss_mlp": 1.00081563, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.4041895155232182, + "language_loss": 0.64918792, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67210829, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.5094025135040283 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.0006907, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.652363283284371, + "language_loss": 0.78907359, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81164223, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.599841356277466 + }, + { + "auxiliary_loss_clip": 0.01152542, + "auxiliary_loss_mlp": 0.01120174, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00058281, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.4428015923600845, + "language_loss": 0.80163074, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82435787, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.577287435531616 + }, + { + "auxiliary_loss_clip": 0.01152609, + "auxiliary_loss_mlp": 0.0112138, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00064468, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 2.0000073444008883, + "language_loss": 0.77675867, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79949856, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.539351463317871 + }, + { + "auxiliary_loss_clip": 0.01136172, + "auxiliary_loss_mlp": 0.00748171, + "balance_loss_clip": 1.00162888, + "balance_loss_mlp": 1.00139582, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.6093688407920845, + "language_loss": 0.64366508, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66250843, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.622574806213379 + }, + { + "auxiliary_loss_clip": 0.01158166, + "auxiliary_loss_mlp": 0.01121733, + "balance_loss_clip": 1.00264859, + "balance_loss_mlp": 1.00061643, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 2.0157341890263205, + "language_loss": 0.75460321, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77740216, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.535207986831665 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.01121369, + "balance_loss_clip": 1.00183308, + "balance_loss_mlp": 1.00063348, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.6174828734056934, + "language_loss": 0.78043067, + "learning_rate": 2.439018845165806e-06, + "loss": 0.8030262, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.5614194869995117 + }, + { + "auxiliary_loss_clip": 0.0115313, + "auxiliary_loss_mlp": 0.01121674, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.00065267, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.5663439888914858, + "language_loss": 0.91064286, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93339092, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.5874712467193604 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.00748247, + "balance_loss_clip": 1.00194442, + "balance_loss_mlp": 1.0013659, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.5299950186095468, + "language_loss": 0.80053914, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81938243, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.6242318153381348 + }, + { + "auxiliary_loss_clip": 0.01137072, + "auxiliary_loss_mlp": 0.01121789, + "balance_loss_clip": 1.00196517, + "balance_loss_mlp": 1.00067246, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 4.063378476245298, + "language_loss": 0.80083644, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82342505, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.576911449432373 + }, + { + "auxiliary_loss_clip": 0.01119306, + "auxiliary_loss_mlp": 0.01121423, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00078297, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 1.9721865905677476, + "language_loss": 0.76623261, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78863984, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.6300413608551025 + }, + { + "auxiliary_loss_clip": 0.01152454, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.0007503, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.7511735540655284, + "language_loss": 0.77334237, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79607606, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 2.637671709060669 + }, + { + "auxiliary_loss_clip": 0.01152999, + "auxiliary_loss_mlp": 0.0112182, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.00060821, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.9606442645219797, + "language_loss": 0.64787889, + "learning_rate": 2.436738768872905e-06, + "loss": 0.67062706, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 2.5610218048095703 + }, + { + "auxiliary_loss_clip": 0.01137647, + "auxiliary_loss_mlp": 0.01121304, + "balance_loss_clip": 1.00199008, + "balance_loss_mlp": 1.00056839, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.6222889176749704, + "language_loss": 0.83517206, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85776156, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.628041982650757 + }, + { + "auxiliary_loss_clip": 0.01107828, + "auxiliary_loss_mlp": 0.01122849, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.0005877, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 2.0271522179321706, + "language_loss": 0.79265738, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81496418, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.692939519882202 + }, + { + "auxiliary_loss_clip": 0.01102504, + "auxiliary_loss_mlp": 0.01121981, + "balance_loss_clip": 1.00175273, + "balance_loss_mlp": 1.00067329, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.6984829170332922, + "language_loss": 0.72066581, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74291068, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.688058376312256 + }, + { + "auxiliary_loss_clip": 0.0110921, + "auxiliary_loss_mlp": 0.01122052, + "balance_loss_clip": 1.00218713, + "balance_loss_mlp": 1.00064898, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 2.510704397836048, + "language_loss": 0.67423987, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69655246, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.7792882919311523 + }, + { + "auxiliary_loss_clip": 0.01139674, + "auxiliary_loss_mlp": 0.01122559, + "balance_loss_clip": 1.0020467, + "balance_loss_mlp": 1.00077522, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.6060408301993698, + "language_loss": 0.73925006, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.76187241, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 2.635646104812622 + }, + { + "auxiliary_loss_clip": 0.01106557, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.0017767, + "balance_loss_mlp": 1.00059175, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.6002118859091787, + "language_loss": 0.74232078, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76460248, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.7467856407165527 + }, + { + "auxiliary_loss_clip": 0.0112057, + "auxiliary_loss_mlp": 0.01122611, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00082684, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.8439694328746965, + "language_loss": 0.74539399, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.76782584, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.6617469787597656 + }, + { + "auxiliary_loss_clip": 0.01169567, + "auxiliary_loss_mlp": 0.01122134, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00054061, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.9791066481050297, + "language_loss": 0.74447334, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76739037, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.599590539932251 + }, + { + "auxiliary_loss_clip": 0.01139493, + "auxiliary_loss_mlp": 0.01121962, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00065422, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.6337673709285723, + "language_loss": 0.77489138, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79750586, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.5928242206573486 + }, + { + "auxiliary_loss_clip": 0.01153031, + "auxiliary_loss_mlp": 0.01121935, + "balance_loss_clip": 1.00207651, + "balance_loss_mlp": 1.00062776, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.9965683463448087, + "language_loss": 0.84654725, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.86929691, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 4.029786825180054 + }, + { + "auxiliary_loss_clip": 0.0112293, + "auxiliary_loss_mlp": 0.01123073, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00062108, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 1.88058036779667, + "language_loss": 0.63616276, + "learning_rate": 2.432557082778765e-06, + "loss": 0.6586228, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.6502926349639893 + }, + { + "auxiliary_loss_clip": 0.01149492, + "auxiliary_loss_mlp": 0.01102989, + "balance_loss_clip": 1.0017904, + "balance_loss_mlp": 0.99999249, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7284773510477556, + "language_loss": 0.50132668, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52385151, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 2.977294921875 + }, + { + "auxiliary_loss_clip": 0.01166129, + "auxiliary_loss_mlp": 0.01103074, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00007725, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7774882402487543, + "language_loss": 0.59327179, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61596382, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 3.128019094467163 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.01121458, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00081801, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.9837412577593043, + "language_loss": 0.59123152, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61365551, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 2.8324506282806396 + }, + { + "auxiliary_loss_clip": 0.01137925, + "auxiliary_loss_mlp": 0.01122322, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00053823, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.8859008832376596, + "language_loss": 0.80055296, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8231554, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 4.010592937469482 + }, + { + "auxiliary_loss_clip": 0.01169539, + "auxiliary_loss_mlp": 0.01121856, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00083458, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 1.960039902035407, + "language_loss": 0.79132801, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81424195, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.500369071960449 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01103305, + "balance_loss_clip": 1.00165415, + "balance_loss_mlp": 1.00030792, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8368768431116949, + "language_loss": 0.62853265, + "learning_rate": 2.430275325332681e-06, + "loss": 0.65057427, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 3.3002655506134033 + }, + { + "auxiliary_loss_clip": 0.01169459, + "auxiliary_loss_mlp": 0.01121935, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00062799, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.8263513448868882, + "language_loss": 0.6252473, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64816129, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.5171725749969482 + }, + { + "auxiliary_loss_clip": 0.011532, + "auxiliary_loss_mlp": 0.01103065, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00006819, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7644254083067069, + "language_loss": 0.57096183, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59352458, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 3.0038843154907227 + }, + { + "auxiliary_loss_clip": 0.01138868, + "auxiliary_loss_mlp": 0.01121963, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00065541, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.0147013145881734, + "language_loss": 0.74985957, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.77246785, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.5544075965881348 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01121303, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.00066304, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.822866030568607, + "language_loss": 0.76424032, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78680992, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 4.194303035736084 + }, + { + "auxiliary_loss_clip": 0.0116951, + "auxiliary_loss_mlp": 0.01121633, + "balance_loss_clip": 1.00214052, + "balance_loss_mlp": 1.00070751, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.7487613034033602, + "language_loss": 0.76360476, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78651619, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 4.004436016082764 + }, + { + "auxiliary_loss_clip": 0.01154547, + "auxiliary_loss_mlp": 0.01122215, + "balance_loss_clip": 1.00201547, + "balance_loss_mlp": 1.0006212, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9612900751883415, + "language_loss": 0.68024909, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.7030167, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.5358195304870605 + }, + { + "auxiliary_loss_clip": 0.01120968, + "auxiliary_loss_mlp": 0.01122175, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.00067639, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.6320712229161631, + "language_loss": 0.71764684, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74007821, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.6149964332580566 + }, + { + "auxiliary_loss_clip": 0.01154698, + "auxiliary_loss_mlp": 0.01121588, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00075793, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 2.0376619670493112, + "language_loss": 0.69466978, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71743262, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.545811891555786 + }, + { + "auxiliary_loss_clip": 0.01169546, + "auxiliary_loss_mlp": 0.01121434, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00060344, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.816611810046, + "language_loss": 0.7678358, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79074562, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.494044780731201 + }, + { + "auxiliary_loss_clip": 0.01169445, + "auxiliary_loss_mlp": 0.01122103, + "balance_loss_clip": 1.00197446, + "balance_loss_mlp": 1.00060523, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.606145127606025, + "language_loss": 0.67607617, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.69899166, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.5337841510772705 + }, + { + "auxiliary_loss_clip": 0.01166032, + "auxiliary_loss_mlp": 0.0110322, + "balance_loss_clip": 1.00171065, + "balance_loss_mlp": 1.00022316, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7508734671925592, + "language_loss": 0.54459131, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56728387, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 3.111638069152832 + }, + { + "auxiliary_loss_clip": 0.01154033, + "auxiliary_loss_mlp": 0.01120578, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00060558, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.9133948535882217, + "language_loss": 0.75321805, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.77596414, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.5922415256500244 + }, + { + "auxiliary_loss_clip": 0.0115249, + "auxiliary_loss_mlp": 0.01121154, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00060928, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.8720907276376677, + "language_loss": 0.74001908, + "learning_rate": 2.425329506653441e-06, + "loss": 0.76275551, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.542940616607666 + }, + { + "auxiliary_loss_clip": 0.01140864, + "auxiliary_loss_mlp": 0.01122182, + "balance_loss_clip": 1.0024271, + "balance_loss_mlp": 1.00058818, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 2.2511089598878824, + "language_loss": 0.80181205, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82444251, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.663562059402466 + }, + { + "auxiliary_loss_clip": 0.01137514, + "auxiliary_loss_mlp": 0.01121831, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00052381, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.6430719190230576, + "language_loss": 0.80430317, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82689667, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.590372085571289 + }, + { + "auxiliary_loss_clip": 0.01102652, + "auxiliary_loss_mlp": 0.01120775, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.00070763, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.9497917988512627, + "language_loss": 0.75121772, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77345198, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.6611990928649902 + }, + { + "auxiliary_loss_clip": 0.01138069, + "auxiliary_loss_mlp": 0.01120626, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00046313, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 2.8067169316223306, + "language_loss": 0.70460725, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.72719419, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.6012580394744873 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.01121839, + "balance_loss_clip": 1.00192595, + "balance_loss_mlp": 1.00062668, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 2.883201455567965, + "language_loss": 0.71999669, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74274659, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.5379979610443115 + }, + { + "auxiliary_loss_clip": 0.01123161, + "auxiliary_loss_mlp": 0.01121459, + "balance_loss_clip": 1.00171292, + "balance_loss_mlp": 1.00072396, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 1.9625634256896147, + "language_loss": 0.76754493, + "learning_rate": 2.423045899863634e-06, + "loss": 0.78999114, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.6453394889831543 + }, + { + "auxiliary_loss_clip": 0.01169486, + "auxiliary_loss_mlp": 0.01121608, + "balance_loss_clip": 1.00210965, + "balance_loss_mlp": 1.00068247, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.5549169870764472, + "language_loss": 0.70392227, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72683322, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.5435757637023926 + }, + { + "auxiliary_loss_clip": 0.01151188, + "auxiliary_loss_mlp": 0.01102207, + "balance_loss_clip": 1.00171614, + "balance_loss_mlp": 0.99997276, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7421882198705051, + "language_loss": 0.61685967, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63939369, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.0936524868011475 + }, + { + "auxiliary_loss_clip": 0.01169423, + "auxiliary_loss_mlp": 0.00748069, + "balance_loss_clip": 1.00201321, + "balance_loss_mlp": 1.00121248, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 1.9251717218440159, + "language_loss": 0.7755115, + "learning_rate": 2.421903879707657e-06, + "loss": 0.79468644, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.502981424331665 + }, + { + "auxiliary_loss_clip": 0.01110731, + "auxiliary_loss_mlp": 0.01120596, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00071919, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.611331754273869, + "language_loss": 0.72179633, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74410963, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 2.6773276329040527 + }, + { + "auxiliary_loss_clip": 0.01105447, + "auxiliary_loss_mlp": 0.01121392, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00056171, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.8164786217544853, + "language_loss": 0.76895261, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79122096, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.738351821899414 + }, + { + "auxiliary_loss_clip": 0.01154168, + "auxiliary_loss_mlp": 0.00747994, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00119805, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 4.0298856866652635, + "language_loss": 0.71800566, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73702729, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.592076301574707 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01122024, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00062132, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.056339579083892, + "language_loss": 0.67894781, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70156026, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.5795629024505615 + }, + { + "auxiliary_loss_clip": 0.01137259, + "auxiliary_loss_mlp": 0.01120491, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00071001, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8780673936522705, + "language_loss": 0.89272463, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91530216, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.5727901458740234 + }, + { + "auxiliary_loss_clip": 0.0109257, + "auxiliary_loss_mlp": 0.01121483, + "balance_loss_clip": 1.00214899, + "balance_loss_mlp": 1.00084341, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 1.67442244867526, + "language_loss": 0.75704694, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77918744, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.7145936489105225 + }, + { + "auxiliary_loss_clip": 0.01122394, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.00059938, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 1.8848695116279817, + "language_loss": 0.79418099, + "learning_rate": 2.419238606731815e-06, + "loss": 0.81662691, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.6213860511779785 + }, + { + "auxiliary_loss_clip": 0.01137612, + "auxiliary_loss_mlp": 0.0112108, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00063097, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.9867374035536294, + "language_loss": 0.68467247, + "learning_rate": 2.418857789743758e-06, + "loss": 0.7072593, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 4.0994298458099365 + }, + { + "auxiliary_loss_clip": 0.01152758, + "auxiliary_loss_mlp": 0.01122282, + "balance_loss_clip": 1.00203001, + "balance_loss_mlp": 1.0007844, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 1.8999091858718504, + "language_loss": 0.84409273, + "learning_rate": 2.418476956872571e-06, + "loss": 0.8668431, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.524977445602417 + }, + { + "auxiliary_loss_clip": 0.01139644, + "auxiliary_loss_mlp": 0.01121545, + "balance_loss_clip": 1.00207675, + "balance_loss_mlp": 1.00081015, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.7960253891830174, + "language_loss": 0.80769885, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.8303107, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.6539008617401123 + }, + { + "auxiliary_loss_clip": 0.01106767, + "auxiliary_loss_mlp": 0.01121782, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00037909, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.1710908961385313, + "language_loss": 0.74412835, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.76641393, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.6354432106018066 + }, + { + "auxiliary_loss_clip": 0.01135805, + "auxiliary_loss_mlp": 0.01102171, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 0.99993688, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7909591486871093, + "language_loss": 0.58650959, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60888934, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 3.18843412399292 + }, + { + "auxiliary_loss_clip": 0.0115437, + "auxiliary_loss_mlp": 0.01121595, + "balance_loss_clip": 1.00207627, + "balance_loss_mlp": 1.00057387, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 4.440928331014789, + "language_loss": 0.83062559, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85338527, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.5286545753479004 + }, + { + "auxiliary_loss_clip": 0.0116933, + "auxiliary_loss_mlp": 0.01120629, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00075209, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5347560538055485, + "language_loss": 0.77161431, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79451394, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 3.9767723083496094 + }, + { + "auxiliary_loss_clip": 0.01154607, + "auxiliary_loss_mlp": 0.01122771, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00079584, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 1.9235309293180938, + "language_loss": 0.71685493, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73962867, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.5979466438293457 + }, + { + "auxiliary_loss_clip": 0.0113875, + "auxiliary_loss_mlp": 0.01121955, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00064778, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.0965259323427796, + "language_loss": 0.6915313, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71413839, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.570122003555298 + }, + { + "auxiliary_loss_clip": 0.01134644, + "auxiliary_loss_mlp": 0.01102278, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.00004387, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7829479235547452, + "language_loss": 0.56622899, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58859825, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.1128461360931396 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01121268, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00081933, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.6718801305793611, + "language_loss": 0.79505527, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81779462, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 2.5774011611938477 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.00747886, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00119638, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.0939740914435547, + "language_loss": 0.92258525, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94128972, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 4.1475160121917725 + }, + { + "auxiliary_loss_clip": 0.01149279, + "auxiliary_loss_mlp": 0.01102262, + "balance_loss_clip": 1.00155854, + "balance_loss_mlp": 1.00002789, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.809433757186319, + "language_loss": 0.6288656, + "learning_rate": 2.4142867511336e-06, + "loss": 0.65138102, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.158190965652466 + }, + { + "auxiliary_loss_clip": 0.01169431, + "auxiliary_loss_mlp": 0.01120438, + "balance_loss_clip": 1.00211382, + "balance_loss_mlp": 1.00075173, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.5950465601418362, + "language_loss": 0.82302058, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84591925, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 4.010816335678101 + }, + { + "auxiliary_loss_clip": 0.01154304, + "auxiliary_loss_mlp": 0.01121908, + "balance_loss_clip": 1.00209868, + "balance_loss_mlp": 1.00069594, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.8086915350569568, + "language_loss": 0.85534734, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87810946, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.708648681640625 + }, + { + "auxiliary_loss_clip": 0.01169536, + "auxiliary_loss_mlp": 0.0112095, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00059676, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.0170254445512223, + "language_loss": 0.76508534, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78799021, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.5272817611694336 + }, + { + "auxiliary_loss_clip": 0.01120954, + "auxiliary_loss_mlp": 0.01121388, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.00065291, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 4.514689573873343, + "language_loss": 0.74920756, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.771631, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.6057775020599365 + }, + { + "auxiliary_loss_clip": 0.0116957, + "auxiliary_loss_mlp": 0.01121526, + "balance_loss_clip": 1.0020858, + "balance_loss_mlp": 1.00069559, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.8965182929921245, + "language_loss": 0.700019, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72292995, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.5354084968566895 + }, + { + "auxiliary_loss_clip": 0.01105457, + "auxiliary_loss_mlp": 0.01121484, + "balance_loss_clip": 1.00166488, + "balance_loss_mlp": 1.00055861, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 2.193082579236089, + "language_loss": 0.77304387, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79531324, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.670653820037842 + }, + { + "auxiliary_loss_clip": 0.01104528, + "auxiliary_loss_mlp": 0.01121283, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00064325, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 3.3388580090746065, + "language_loss": 0.6234377, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64569581, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.6592698097229004 + }, + { + "auxiliary_loss_clip": 0.01169579, + "auxiliary_loss_mlp": 0.01122133, + "balance_loss_clip": 1.00208223, + "balance_loss_mlp": 1.00073075, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9142575154438872, + "language_loss": 0.84676898, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86968613, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.486907958984375 + }, + { + "auxiliary_loss_clip": 0.01135338, + "auxiliary_loss_mlp": 0.0112033, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00064373, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.422366587539057, + "language_loss": 0.79573512, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81829184, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.580014944076538 + }, + { + "auxiliary_loss_clip": 0.01140088, + "auxiliary_loss_mlp": 0.01121577, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00065112, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 1.8227813463220615, + "language_loss": 0.81006932, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83268595, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.561037540435791 + }, + { + "auxiliary_loss_clip": 0.0112486, + "auxiliary_loss_mlp": 0.01120797, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.00063372, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1.727438049856926, + "language_loss": 0.63626707, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65872365, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.6150779724121094 + }, + { + "auxiliary_loss_clip": 0.01117318, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.00149846, + "balance_loss_mlp": 1.00056446, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8316459397848842, + "language_loss": 0.5886209, + "learning_rate": 2.409713450313968e-06, + "loss": 0.61082971, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 3.2277934551239014 + }, + { + "auxiliary_loss_clip": 0.01103904, + "auxiliary_loss_mlp": 0.01120852, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00068903, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.6139874519407638, + "language_loss": 0.79145443, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81370205, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.6642184257507324 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01121383, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00055289, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5632750713315302, + "language_loss": 0.73895484, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76138002, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.6603739261627197 + }, + { + "auxiliary_loss_clip": 0.01152557, + "auxiliary_loss_mlp": 0.01120397, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.0006156, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.8342784114147523, + "language_loss": 0.79152763, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81425714, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.530073404312134 + }, + { + "auxiliary_loss_clip": 0.01169456, + "auxiliary_loss_mlp": 0.01120808, + "balance_loss_clip": 1.00214934, + "balance_loss_mlp": 1.00074077, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.767822017262681, + "language_loss": 0.72964621, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75254887, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.5331473350524902 + }, + { + "auxiliary_loss_clip": 0.01169372, + "auxiliary_loss_mlp": 0.01121254, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00051832, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 2.5115664559362685, + "language_loss": 0.77287257, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79577881, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.5121870040893555 + }, + { + "auxiliary_loss_clip": 0.01152725, + "auxiliary_loss_mlp": 0.01121057, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00079906, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.5506889904239454, + "language_loss": 0.78830028, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.81103808, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.5740325450897217 + }, + { + "auxiliary_loss_clip": 0.01121199, + "auxiliary_loss_mlp": 0.01121411, + "balance_loss_clip": 1.00191462, + "balance_loss_mlp": 1.00058079, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 1.8250473553772246, + "language_loss": 0.86756021, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.8899864, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.7026333808898926 + }, + { + "auxiliary_loss_clip": 0.01152568, + "auxiliary_loss_mlp": 0.01119813, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 1.00050795, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.5027225225289238, + "language_loss": 0.66895127, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69167501, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.5773637294769287 + }, + { + "auxiliary_loss_clip": 0.01152998, + "auxiliary_loss_mlp": 0.0112114, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00040472, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 6.764880119893048, + "language_loss": 0.69768584, + "learning_rate": 2.406282005146318e-06, + "loss": 0.72042722, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 2.5481314659118652 + }, + { + "auxiliary_loss_clip": 0.01154391, + "auxiliary_loss_mlp": 0.0112127, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00063074, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.618182683811755, + "language_loss": 0.81532061, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83807719, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.511620283126831 + }, + { + "auxiliary_loss_clip": 0.01169501, + "auxiliary_loss_mlp": 0.01121255, + "balance_loss_clip": 1.00217175, + "balance_loss_mlp": 1.00061536, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 8.340228359004268, + "language_loss": 0.65540075, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67830837, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.4854578971862793 + }, + { + "auxiliary_loss_clip": 0.01122061, + "auxiliary_loss_mlp": 0.01121018, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.00066459, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.7973295585609839, + "language_loss": 0.62825221, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65068299, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.6089489459991455 + }, + { + "auxiliary_loss_clip": 0.01152868, + "auxiliary_loss_mlp": 0.0112098, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.00062656, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.3148748557399637, + "language_loss": 0.59338838, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61612684, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.8154118061065674 + }, + { + "auxiliary_loss_clip": 0.01152785, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_clip": 1.00211525, + "balance_loss_mlp": 1.00080633, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.4489929612492738, + "language_loss": 0.72526509, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74800932, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.5714497566223145 + }, + { + "auxiliary_loss_clip": 0.01137886, + "auxiliary_loss_mlp": 0.0112117, + "balance_loss_clip": 1.00216126, + "balance_loss_mlp": 1.00072098, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.750303037635045, + "language_loss": 0.75703907, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77962971, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 4.027543306350708 + }, + { + "auxiliary_loss_clip": 0.01137141, + "auxiliary_loss_mlp": 0.01121239, + "balance_loss_clip": 1.00200236, + "balance_loss_mlp": 1.00078964, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.7961533247702903, + "language_loss": 0.68114108, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70372486, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.593935489654541 + }, + { + "auxiliary_loss_clip": 0.0115265, + "auxiliary_loss_mlp": 0.01121129, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00067997, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.458207420887846, + "language_loss": 0.61237925, + "learning_rate": 2.403230783711134e-06, + "loss": 0.63511705, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.626383066177368 + }, + { + "auxiliary_loss_clip": 0.01153033, + "auxiliary_loss_mlp": 0.01121936, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00062847, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 1.801721855363876, + "language_loss": 0.7838968, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80664647, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.5404469966888428 + }, + { + "auxiliary_loss_clip": 0.01107388, + "auxiliary_loss_mlp": 0.01121475, + "balance_loss_clip": 1.00186312, + "balance_loss_mlp": 1.00073946, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.624953638239969, + "language_loss": 0.6406669, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.66295552, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.6872642040252686 + }, + { + "auxiliary_loss_clip": 0.01153531, + "auxiliary_loss_mlp": 0.01120863, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00069988, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.596391809625729, + "language_loss": 0.79399097, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81673485, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 3.99493408203125 + }, + { + "auxiliary_loss_clip": 0.01137646, + "auxiliary_loss_mlp": 0.01121036, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.00049162, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 3.58987835709291, + "language_loss": 0.81032169, + "learning_rate": 2.40170480555747e-06, + "loss": 0.83290851, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.6260478496551514 + }, + { + "auxiliary_loss_clip": 0.01125416, + "auxiliary_loss_mlp": 0.01121076, + "balance_loss_clip": 1.00239563, + "balance_loss_mlp": 1.00053132, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.6342401659039525, + "language_loss": 0.65256983, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67503476, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 2.7093710899353027 + }, + { + "auxiliary_loss_clip": 0.01139578, + "auxiliary_loss_mlp": 0.01120928, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.0006696, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.626009453770701, + "language_loss": 0.75371641, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77632147, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.6325316429138184 + }, + { + "auxiliary_loss_clip": 0.01169479, + "auxiliary_loss_mlp": 0.0112066, + "balance_loss_clip": 1.00215983, + "balance_loss_mlp": 1.00068736, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 2.4053077640072207, + "language_loss": 0.7263577, + "learning_rate": 2.400560161948384e-06, + "loss": 0.749259, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.494199275970459 + }, + { + "auxiliary_loss_clip": 0.01122115, + "auxiliary_loss_mlp": 0.01121231, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.0005908, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6934878226455974, + "language_loss": 0.76302874, + "learning_rate": 2.400178583680834e-06, + "loss": 0.7854622, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 4.014787197113037 + }, + { + "auxiliary_loss_clip": 0.01169361, + "auxiliary_loss_mlp": 0.01120808, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.00074077, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.9961054983394166, + "language_loss": 0.67057526, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69347697, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 3.9510669708251953 + }, + { + "auxiliary_loss_clip": 0.0115279, + "auxiliary_loss_mlp": 0.01120979, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00062537, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.6414341876223904, + "language_loss": 0.78868914, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81142688, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.5456833839416504 + }, + { + "auxiliary_loss_clip": 0.01120845, + "auxiliary_loss_mlp": 0.01122765, + "balance_loss_clip": 1.00176156, + "balance_loss_mlp": 1.00059915, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.6673228094681265, + "language_loss": 0.83241707, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85485315, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.6454734802246094 + }, + { + "auxiliary_loss_clip": 0.01136136, + "auxiliary_loss_mlp": 0.01121492, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00066137, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.4786361663223095, + "language_loss": 0.76749998, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.79007626, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.6190061569213867 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01120923, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00056982, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.512429999407447, + "language_loss": 0.80744696, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82984656, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.6191790103912354 + }, + { + "auxiliary_loss_clip": 0.01137913, + "auxiliary_loss_mlp": 0.01120982, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00053334, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 1.843169840820569, + "language_loss": 0.7552979, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.77788681, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.585387706756592 + }, + { + "auxiliary_loss_clip": 0.01154409, + "auxiliary_loss_mlp": 0.01120888, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00043881, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.1661286464583807, + "language_loss": 0.75515801, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.77791101, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.5522758960723877 + }, + { + "auxiliary_loss_clip": 0.01150876, + "auxiliary_loss_mlp": 0.01102226, + "balance_loss_clip": 1.00141966, + "balance_loss_mlp": 0.99999207, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7823932841267932, + "language_loss": 0.62428987, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.6468209, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.1624066829681396 + }, + { + "auxiliary_loss_clip": 0.01169373, + "auxiliary_loss_mlp": 0.01121644, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00081372, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 2.2494459775789277, + "language_loss": 0.65358543, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67649567, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.4895215034484863 + }, + { + "auxiliary_loss_clip": 0.01138666, + "auxiliary_loss_mlp": 0.01121306, + "balance_loss_clip": 1.00202763, + "balance_loss_mlp": 1.00066638, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.7762919560504389, + "language_loss": 0.85001075, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87261045, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.604311943054199 + }, + { + "auxiliary_loss_clip": 0.01137095, + "auxiliary_loss_mlp": 0.01120964, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00051475, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.6290944812406307, + "language_loss": 0.76541913, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78799975, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.751359462738037 + }, + { + "auxiliary_loss_clip": 0.01136144, + "auxiliary_loss_mlp": 0.01120807, + "balance_loss_clip": 1.00168371, + "balance_loss_mlp": 1.00045347, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 2.0951815432878886, + "language_loss": 0.80388278, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82645237, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 2.5956032276153564 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.00747973, + "balance_loss_clip": 1.00206447, + "balance_loss_mlp": 1.00106406, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.5744786341497403, + "language_loss": 0.75983596, + "learning_rate": 2.395216690562469e-06, + "loss": 0.77885979, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.59759259223938 + }, + { + "auxiliary_loss_clip": 0.01120788, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.00080752, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 2.0604492272369557, + "language_loss": 0.75433111, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77675444, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 2.6663901805877686 + }, + { + "auxiliary_loss_clip": 0.01137214, + "auxiliary_loss_mlp": 0.01120085, + "balance_loss_clip": 1.00201213, + "balance_loss_mlp": 1.0004946, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5619029129326998, + "language_loss": 0.72500956, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74758255, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.657301187515259 + }, + { + "auxiliary_loss_clip": 0.01136781, + "auxiliary_loss_mlp": 0.01121959, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00065172, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.3655673439593814, + "language_loss": 0.75506067, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77764809, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.600771427154541 + }, + { + "auxiliary_loss_clip": 0.01154439, + "auxiliary_loss_mlp": 0.01121022, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00057316, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.0959742015661544, + "language_loss": 0.69528651, + "learning_rate": 2.393689443195573e-06, + "loss": 0.71804112, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.545400619506836 + }, + { + "auxiliary_loss_clip": 0.0116928, + "auxiliary_loss_mlp": 0.01120648, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00067604, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 1.8638908250033723, + "language_loss": 0.72605509, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74895442, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.5541844367980957 + }, + { + "auxiliary_loss_clip": 0.01120414, + "auxiliary_loss_mlp": 0.01120464, + "balance_loss_clip": 1.00185788, + "balance_loss_mlp": 1.0004921, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.6417682430227467, + "language_loss": 0.65111232, + "learning_rate": 2.392925729881751e-06, + "loss": 0.6735211, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.699842929840088 + }, + { + "auxiliary_loss_clip": 0.01153082, + "auxiliary_loss_mlp": 0.01120245, + "balance_loss_clip": 1.00224507, + "balance_loss_mlp": 1.00065398, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6322611649945267, + "language_loss": 0.68627417, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70900738, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.568450689315796 + }, + { + "auxiliary_loss_clip": 0.01154629, + "auxiliary_loss_mlp": 0.01121375, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00063968, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.905242693366571, + "language_loss": 0.79654658, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81930661, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.5043909549713135 + }, + { + "auxiliary_loss_clip": 0.01148953, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_clip": 1.0014298, + "balance_loss_mlp": 1.00000751, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8120509487680928, + "language_loss": 0.57710606, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59962559, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.090306282043457 + }, + { + "auxiliary_loss_clip": 0.01090124, + "auxiliary_loss_mlp": 0.01119978, + "balance_loss_clip": 1.00209808, + "balance_loss_mlp": 1.00067341, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3903405740290495, + "language_loss": 0.76574308, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78784406, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.01136099, + "auxiliary_loss_mlp": 0.01122081, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00048733, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.4253927952998358, + "language_loss": 0.76676357, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.78934538, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 2.5769031047821045 + }, + { + "auxiliary_loss_clip": 0.01071882, + "auxiliary_loss_mlp": 0.01120064, + "balance_loss_clip": 1.00159967, + "balance_loss_mlp": 1.00066447, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.307594954077503, + "language_loss": 0.72495019, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74686956, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.7933542728424072 + }, + { + "auxiliary_loss_clip": 0.01169549, + "auxiliary_loss_mlp": 0.01121977, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00066972, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 2.154877189412961, + "language_loss": 0.6289801, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65189534, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.5813817977905273 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01103108, + "balance_loss_clip": 1.00140333, + "balance_loss_mlp": 1.00011146, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6856119653181825, + "language_loss": 0.5762949, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59866869, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 4.535938024520874 + }, + { + "auxiliary_loss_clip": 0.01152981, + "auxiliary_loss_mlp": 0.01121777, + "balance_loss_clip": 1.00214195, + "balance_loss_mlp": 1.00075531, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.7157398271732913, + "language_loss": 0.55674893, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.5794965, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.5485432147979736 + }, + { + "auxiliary_loss_clip": 0.0115294, + "auxiliary_loss_mlp": 0.0074791, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00109518, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 1.85634125456883, + "language_loss": 0.72164792, + "learning_rate": 2.389106271642792e-06, + "loss": 0.7406565, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.5583336353302 + }, + { + "auxiliary_loss_clip": 0.01077062, + "auxiliary_loss_mlp": 0.01121124, + "balance_loss_clip": 1.00233746, + "balance_loss_mlp": 1.00067484, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 3.213126472189403, + "language_loss": 0.69007432, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.71205616, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.7174484729766846 + }, + { + "auxiliary_loss_clip": 0.01135806, + "auxiliary_loss_mlp": 0.01120003, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.00060356, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.619908765575601, + "language_loss": 0.85137784, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87393594, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 2.57131028175354 + }, + { + "auxiliary_loss_clip": 0.01153981, + "auxiliary_loss_mlp": 0.01120789, + "balance_loss_clip": 1.00220561, + "balance_loss_mlp": 1.00062633, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.7632535066444934, + "language_loss": 0.89323288, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91598052, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.543067216873169 + }, + { + "auxiliary_loss_clip": 0.01169375, + "auxiliary_loss_mlp": 0.00748018, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00118017, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.8015975041239116, + "language_loss": 0.71492952, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73410344, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 4.017183542251587 + }, + { + "auxiliary_loss_clip": 0.01154365, + "auxiliary_loss_mlp": 0.01121657, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00073099, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 1.8782969344630542, + "language_loss": 0.68083173, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70359194, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.562955617904663 + }, + { + "auxiliary_loss_clip": 0.01105331, + "auxiliary_loss_mlp": 0.01120929, + "balance_loss_clip": 1.00175738, + "balance_loss_mlp": 1.00057566, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.7959274564889733, + "language_loss": 0.80373263, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82599533, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.6911678314208984 + }, + { + "auxiliary_loss_clip": 0.01121045, + "auxiliary_loss_mlp": 0.0112194, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00063252, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.71417089567671, + "language_loss": 0.73960435, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.76203418, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.640655994415283 + }, + { + "auxiliary_loss_clip": 0.01121412, + "auxiliary_loss_mlp": 0.01121872, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00075555, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.9794952257967176, + "language_loss": 0.80881912, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83125198, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.6897101402282715 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01122585, + "balance_loss_clip": 1.00210726, + "balance_loss_mlp": 1.00099206, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.7879218972950974, + "language_loss": 0.79480231, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81756651, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 4.007136583328247 + }, + { + "auxiliary_loss_clip": 0.01152872, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00057292, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3928160238483396, + "language_loss": 0.7488066, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77155125, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 3.972031354904175 + }, + { + "auxiliary_loss_clip": 0.0113696, + "auxiliary_loss_mlp": 0.01121906, + "balance_loss_clip": 1.00212324, + "balance_loss_mlp": 1.00088525, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.6816538368645695, + "language_loss": 0.74666351, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76925218, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 2.676485300064087 + }, + { + "auxiliary_loss_clip": 0.01152968, + "auxiliary_loss_mlp": 0.01120633, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00075567, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.4552962820509159, + "language_loss": 0.81095278, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83368874, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.5521762371063232 + }, + { + "auxiliary_loss_clip": 0.01139524, + "auxiliary_loss_mlp": 0.01123148, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00098276, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.4399499324892635, + "language_loss": 0.72680998, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.74943674, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.6407923698425293 + }, + { + "auxiliary_loss_clip": 0.01152794, + "auxiliary_loss_mlp": 0.01122689, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00071406, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 2.297115067811297, + "language_loss": 0.74582112, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76857603, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.619335651397705 + }, + { + "auxiliary_loss_clip": 0.01152749, + "auxiliary_loss_mlp": 0.01121967, + "balance_loss_clip": 1.00210309, + "balance_loss_mlp": 1.00066018, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.4871733299629544, + "language_loss": 0.71545327, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73820043, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.599090099334717 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01121528, + "balance_loss_clip": 1.00190938, + "balance_loss_mlp": 1.00069714, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.930120144511236, + "language_loss": 0.7316432, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75421834, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.595155954360962 + }, + { + "auxiliary_loss_clip": 0.0116943, + "auxiliary_loss_mlp": 0.01120932, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00067353, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.7495742563412326, + "language_loss": 0.65829718, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68120086, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.5414254665374756 + }, + { + "auxiliary_loss_clip": 0.01137635, + "auxiliary_loss_mlp": 0.01122252, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00084925, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.5675409535993943, + "language_loss": 0.73964024, + "learning_rate": 2.382227538303157e-06, + "loss": 0.7622391, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.5877017974853516 + }, + { + "auxiliary_loss_clip": 0.0110357, + "auxiliary_loss_mlp": 0.00747961, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00107098, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.686933081429385, + "language_loss": 0.69832832, + "learning_rate": 2.381845247976697e-06, + "loss": 0.71684361, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.7044873237609863 + }, + { + "auxiliary_loss_clip": 0.01154269, + "auxiliary_loss_mlp": 0.01120697, + "balance_loss_clip": 1.00201845, + "balance_loss_mlp": 1.00072467, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 4.810268942298498, + "language_loss": 0.78603339, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80878311, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.561302423477173 + }, + { + "auxiliary_loss_clip": 0.01169563, + "auxiliary_loss_mlp": 0.01121972, + "balance_loss_clip": 1.00228882, + "balance_loss_mlp": 1.00066507, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.4883645789719866, + "language_loss": 0.68850815, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71142352, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.6683003902435303 + }, + { + "auxiliary_loss_clip": 0.01154322, + "auxiliary_loss_mlp": 0.01121705, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00077903, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.8479305937940864, + "language_loss": 0.73367023, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75643051, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.6137454509735107 + }, + { + "auxiliary_loss_clip": 0.01169634, + "auxiliary_loss_mlp": 0.01122325, + "balance_loss_clip": 1.00218976, + "balance_loss_mlp": 1.00092256, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.8334623029506052, + "language_loss": 0.72480649, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74772608, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.5217630863189697 + }, + { + "auxiliary_loss_clip": 0.01153242, + "auxiliary_loss_mlp": 0.01122368, + "balance_loss_clip": 1.00223756, + "balance_loss_mlp": 1.00077438, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.9036264331281307, + "language_loss": 0.72793537, + "learning_rate": 2.379933579440195e-06, + "loss": 0.75069147, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.58207631111145 + }, + { + "auxiliary_loss_clip": 0.01122619, + "auxiliary_loss_mlp": 0.01121694, + "balance_loss_clip": 1.00216281, + "balance_loss_mlp": 1.00067282, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.394301352032044, + "language_loss": 0.68117988, + "learning_rate": 2.379551202453541e-06, + "loss": 0.703623, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.7974765300750732 + }, + { + "auxiliary_loss_clip": 0.01169501, + "auxiliary_loss_mlp": 0.0112105, + "balance_loss_clip": 1.00215518, + "balance_loss_mlp": 1.00069654, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.4124816960366555, + "language_loss": 0.76170492, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78461051, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.5169925689697266 + }, + { + "auxiliary_loss_clip": 0.01139034, + "auxiliary_loss_mlp": 0.01120785, + "balance_loss_clip": 1.00215983, + "balance_loss_mlp": 1.00052643, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.6245247171094173, + "language_loss": 0.77969015, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80228829, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.6316401958465576 + }, + { + "auxiliary_loss_clip": 0.01139366, + "auxiliary_loss_mlp": 0.01122321, + "balance_loss_clip": 1.00208426, + "balance_loss_mlp": 1.00101376, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 1.7072418807974044, + "language_loss": 0.69285643, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71547329, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.5681488513946533 + }, + { + "auxiliary_loss_clip": 0.01152604, + "auxiliary_loss_mlp": 0.01121309, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.00076497, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.584295049372721, + "language_loss": 0.79491317, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81765229, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.541959285736084 + }, + { + "auxiliary_loss_clip": 0.01154735, + "auxiliary_loss_mlp": 0.01121533, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.00070286, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 3.8981790949194983, + "language_loss": 0.62604189, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64880455, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.6380980014801025 + }, + { + "auxiliary_loss_clip": 0.01137673, + "auxiliary_loss_mlp": 0.01120825, + "balance_loss_clip": 1.0020951, + "balance_loss_mlp": 1.00066161, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.794106930773488, + "language_loss": 0.72746223, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75004721, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.592863082885742 + }, + { + "auxiliary_loss_clip": 0.0113739, + "auxiliary_loss_mlp": 0.01122097, + "balance_loss_clip": 1.00208938, + "balance_loss_mlp": 1.00078917, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 1.7566853665033597, + "language_loss": 0.76817834, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79077315, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.5791990756988525 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01121189, + "balance_loss_clip": 1.0021069, + "balance_loss_mlp": 1.00064504, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 1.94779295654068, + "language_loss": 0.6948334, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71743834, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 2.6010305881500244 + }, + { + "auxiliary_loss_clip": 0.01154316, + "auxiliary_loss_mlp": 0.01121032, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.0006783, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.0371745826625265, + "language_loss": 0.83406126, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.85681474, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.603313446044922 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.0074654, + "balance_loss_clip": 1.00150037, + "balance_loss_mlp": 1.00074613, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7873669992475503, + "language_loss": 0.52789605, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54686975, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.1531593799591064 + }, + { + "auxiliary_loss_clip": 0.01121389, + "auxiliary_loss_mlp": 0.01121339, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00060415, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.0275399062776573, + "language_loss": 0.8705734, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89300066, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 4.03955864906311 + }, + { + "auxiliary_loss_clip": 0.01154461, + "auxiliary_loss_mlp": 0.01122153, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00094104, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 1.8119554496098411, + "language_loss": 0.77291596, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79568207, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.5375490188598633 + }, + { + "auxiliary_loss_clip": 0.01152601, + "auxiliary_loss_mlp": 0.01120705, + "balance_loss_clip": 1.00202179, + "balance_loss_mlp": 1.00054193, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.6741852133507298, + "language_loss": 0.78009796, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80283105, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.5446174144744873 + }, + { + "auxiliary_loss_clip": 0.01169432, + "auxiliary_loss_mlp": 0.01119954, + "balance_loss_clip": 1.00217032, + "balance_loss_mlp": 1.00064969, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.2880508805408875, + "language_loss": 0.71243727, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73533118, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 2.5591044425964355 + }, + { + "auxiliary_loss_clip": 0.01120563, + "auxiliary_loss_mlp": 0.01119884, + "balance_loss_clip": 1.00193918, + "balance_loss_mlp": 1.0008651, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 2.2722393760876085, + "language_loss": 0.69605148, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71845603, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.634995698928833 + }, + { + "auxiliary_loss_clip": 0.01087728, + "auxiliary_loss_mlp": 0.01121598, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00076747, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.753184519847544, + "language_loss": 0.78571272, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80780596, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.703192710876465 + }, + { + "auxiliary_loss_clip": 0.01137986, + "auxiliary_loss_mlp": 0.01121753, + "balance_loss_clip": 1.00212193, + "balance_loss_mlp": 1.00082707, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.709164097064701, + "language_loss": 0.71346486, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73606217, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 4.15989351272583 + }, + { + "auxiliary_loss_clip": 0.01154502, + "auxiliary_loss_mlp": 0.01121975, + "balance_loss_clip": 1.00218296, + "balance_loss_mlp": 1.00066805, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.7709130842694347, + "language_loss": 0.72998583, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75275058, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.5873782634735107 + }, + { + "auxiliary_loss_clip": 0.01152191, + "auxiliary_loss_mlp": 0.01120547, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00086093, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.962961996868548, + "language_loss": 0.83447027, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85719758, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 2.526280403137207 + }, + { + "auxiliary_loss_clip": 0.01136474, + "auxiliary_loss_mlp": 0.01121232, + "balance_loss_clip": 1.00217593, + "balance_loss_mlp": 1.00068772, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.591860305451769, + "language_loss": 0.85641706, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87899417, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.6105282306671143 + }, + { + "auxiliary_loss_clip": 0.0110427, + "auxiliary_loss_mlp": 0.01120803, + "balance_loss_clip": 1.00183403, + "balance_loss_mlp": 1.00063992, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.7469077286533363, + "language_loss": 0.73597169, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75822246, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 4.107832908630371 + }, + { + "auxiliary_loss_clip": 0.01120996, + "auxiliary_loss_mlp": 0.01121333, + "balance_loss_clip": 1.00193012, + "balance_loss_mlp": 1.00078893, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 1.9518224967017737, + "language_loss": 0.80014241, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82256567, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.652972936630249 + }, + { + "auxiliary_loss_clip": 0.01122735, + "auxiliary_loss_mlp": 0.01121288, + "balance_loss_clip": 1.00230026, + "balance_loss_mlp": 1.00074339, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 1.8516044153594844, + "language_loss": 0.80950236, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83194262, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 4.10842490196228 + }, + { + "auxiliary_loss_clip": 0.01137508, + "auxiliary_loss_mlp": 0.01120641, + "balance_loss_clip": 1.00197935, + "balance_loss_mlp": 1.00076461, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 2.368978716432916, + "language_loss": 0.68183404, + "learning_rate": 2.370369870345559e-06, + "loss": 0.7044155, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.6029930114746094 + }, + { + "auxiliary_loss_clip": 0.0113637, + "auxiliary_loss_mlp": 0.01121495, + "balance_loss_clip": 1.0021975, + "balance_loss_mlp": 1.00095057, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.768044314829577, + "language_loss": 0.80709708, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82967579, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.6346821784973145 + }, + { + "auxiliary_loss_clip": 0.01152705, + "auxiliary_loss_mlp": 0.01121085, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00073159, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.3139128917668668, + "language_loss": 0.8254357, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84817362, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.567594051361084 + }, + { + "auxiliary_loss_clip": 0.01153691, + "auxiliary_loss_mlp": 0.01120481, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.00050902, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 3.761547978169577, + "language_loss": 0.73686469, + "learning_rate": 2.369221630917819e-06, + "loss": 0.75960642, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.66355299949646 + }, + { + "auxiliary_loss_clip": 0.01137592, + "auxiliary_loss_mlp": 0.01120484, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00070286, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.5069480827062833, + "language_loss": 0.84937954, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87196028, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.585530996322632 + }, + { + "auxiliary_loss_clip": 0.01121061, + "auxiliary_loss_mlp": 0.01121189, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.00054944, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.0441107969419847, + "language_loss": 0.75993115, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.78235364, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.599717140197754 + }, + { + "auxiliary_loss_clip": 0.01169303, + "auxiliary_loss_mlp": 0.01120603, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.00063038, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.607999342788109, + "language_loss": 0.74584687, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76874596, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.514305830001831 + }, + { + "auxiliary_loss_clip": 0.01133453, + "auxiliary_loss_mlp": 0.01102345, + "balance_loss_clip": 1.00143266, + "balance_loss_mlp": 1.00011063, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7823351989182808, + "language_loss": 0.57690871, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59926665, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.1014671325683594 + }, + { + "auxiliary_loss_clip": 0.01137913, + "auxiliary_loss_mlp": 0.00747853, + "balance_loss_clip": 1.00202858, + "balance_loss_mlp": 1.00110853, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.7847354662884058, + "language_loss": 0.70530558, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.72416329, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.5811543464660645 + }, + { + "auxiliary_loss_clip": 0.0116943, + "auxiliary_loss_mlp": 0.01120622, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00074553, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 1.9928719103490489, + "language_loss": 0.76302207, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78592259, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.4971671104431152 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01121191, + "balance_loss_clip": 1.00171506, + "balance_loss_mlp": 1.00093257, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 2.4172889398801245, + "language_loss": 0.76987541, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79229891, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.6215384006500244 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.01120385, + "balance_loss_clip": 1.00223982, + "balance_loss_mlp": 1.00089002, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.8066322933539192, + "language_loss": 0.72131228, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74421078, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.4955012798309326 + }, + { + "auxiliary_loss_clip": 0.011367, + "auxiliary_loss_mlp": 0.0111965, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00053644, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.7021722805294273, + "language_loss": 0.78272021, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80528367, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.7802436351776123 + }, + { + "auxiliary_loss_clip": 0.01150962, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_clip": 1.00149012, + "balance_loss_mlp": 1.0001303, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7715965302974868, + "language_loss": 0.65021873, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67275196, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.1310410499572754 + }, + { + "auxiliary_loss_clip": 0.01152834, + "auxiliary_loss_mlp": 0.01120918, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00046861, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.636941362507514, + "language_loss": 0.79538214, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81811965, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.5950348377227783 + }, + { + "auxiliary_loss_clip": 0.01106082, + "auxiliary_loss_mlp": 0.01120923, + "balance_loss_clip": 1.00200641, + "balance_loss_mlp": 1.00066447, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.910137101138436, + "language_loss": 0.71161067, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.73388076, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 2.649348735809326 + }, + { + "auxiliary_loss_clip": 0.01137882, + "auxiliary_loss_mlp": 0.01120014, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00070977, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 1.715446286158401, + "language_loss": 0.72936499, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75194395, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.613276481628418 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01119991, + "balance_loss_clip": 1.00228429, + "balance_loss_mlp": 1.00059116, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 2.0488229769242, + "language_loss": 0.78445917, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80722219, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.5533339977264404 + }, + { + "auxiliary_loss_clip": 0.01169426, + "auxiliary_loss_mlp": 0.01121589, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.00066352, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.8648674782422159, + "language_loss": 0.84960496, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87251514, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.48458194732666 + }, + { + "auxiliary_loss_clip": 0.01169549, + "auxiliary_loss_mlp": 0.01121691, + "balance_loss_clip": 1.00213146, + "balance_loss_mlp": 1.00057399, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 2.360396197322804, + "language_loss": 0.69168597, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71459836, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.586104393005371 + }, + { + "auxiliary_loss_clip": 0.01154155, + "auxiliary_loss_mlp": 0.01120497, + "balance_loss_clip": 1.00199258, + "balance_loss_mlp": 1.00052524, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.5171021378685656, + "language_loss": 0.78427553, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80702204, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.563063859939575 + }, + { + "auxiliary_loss_clip": 0.01137127, + "auxiliary_loss_mlp": 0.01121233, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00068879, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.078884182844201, + "language_loss": 0.79847383, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.8210575, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 2.5609822273254395 + }, + { + "auxiliary_loss_clip": 0.01135991, + "auxiliary_loss_mlp": 0.01121494, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00075889, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.553438815969945, + "language_loss": 0.72083002, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74340487, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.7181763648986816 + }, + { + "auxiliary_loss_clip": 0.01120566, + "auxiliary_loss_mlp": 0.01121393, + "balance_loss_clip": 1.00201261, + "balance_loss_mlp": 1.00075364, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.5688586310666874, + "language_loss": 0.70898229, + "learning_rate": 2.361563500108531e-06, + "loss": 0.73140186, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.6490771770477295 + }, + { + "auxiliary_loss_clip": 0.01105891, + "auxiliary_loss_mlp": 0.00748098, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00114751, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.4132371561591706, + "language_loss": 0.6900785, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.70861834, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.6536407470703125 + }, + { + "auxiliary_loss_clip": 0.01152635, + "auxiliary_loss_mlp": 0.01120812, + "balance_loss_clip": 1.00207806, + "balance_loss_mlp": 1.00074422, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.451733413292023, + "language_loss": 0.80821455, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.83094901, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 4.002928733825684 + }, + { + "auxiliary_loss_clip": 0.01153085, + "auxiliary_loss_mlp": 0.00748048, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00127161, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.744409397718065, + "language_loss": 0.81793624, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83694756, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.5504379272460938 + }, + { + "auxiliary_loss_clip": 0.01135892, + "auxiliary_loss_mlp": 0.01120469, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00078297, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 3.082834629893899, + "language_loss": 0.64835078, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67091441, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.706319808959961 + }, + { + "auxiliary_loss_clip": 0.01152971, + "auxiliary_loss_mlp": 0.01120856, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0006932, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4136190305807768, + "language_loss": 0.80538785, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82812607, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 2.584590196609497 + }, + { + "auxiliary_loss_clip": 0.0113969, + "auxiliary_loss_mlp": 0.01121906, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00078928, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.3523855515687428, + "language_loss": 0.75425088, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77686691, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.5962486267089844 + }, + { + "auxiliary_loss_clip": 0.01152666, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_clip": 1.00207639, + "balance_loss_mlp": 1.00067663, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.8170462468219424, + "language_loss": 0.74093217, + "learning_rate": 2.358881852733989e-06, + "loss": 0.7636596, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 3.9905683994293213 + }, + { + "auxiliary_loss_clip": 0.01169425, + "auxiliary_loss_mlp": 0.01121057, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00070333, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.7869893875994798, + "language_loss": 0.68014109, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70304585, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.53544545173645 + }, + { + "auxiliary_loss_clip": 0.01141871, + "auxiliary_loss_mlp": 0.01121635, + "balance_loss_clip": 1.00213313, + "balance_loss_mlp": 1.00070858, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.9524917999512377, + "language_loss": 0.75570488, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77833992, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.567568063735962 + }, + { + "auxiliary_loss_clip": 0.01137521, + "auxiliary_loss_mlp": 0.01121656, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.00063515, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.8311487170728395, + "language_loss": 0.74848557, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77107733, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.5737979412078857 + }, + { + "auxiliary_loss_clip": 0.01149352, + "auxiliary_loss_mlp": 0.01102369, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00013471, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8562657292943885, + "language_loss": 0.58156371, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60408092, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 2.86997389793396 + }, + { + "auxiliary_loss_clip": 0.01152807, + "auxiliary_loss_mlp": 0.01121707, + "balance_loss_clip": 1.00201118, + "balance_loss_mlp": 1.00068545, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5302056304099998, + "language_loss": 0.93131977, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95406491, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 3.9845645427703857 + }, + { + "auxiliary_loss_clip": 0.0115288, + "auxiliary_loss_mlp": 0.01121332, + "balance_loss_clip": 1.00184286, + "balance_loss_mlp": 1.00069189, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 2.069320599813594, + "language_loss": 0.82547575, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.84821784, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.525109052658081 + }, + { + "auxiliary_loss_clip": 0.01122607, + "auxiliary_loss_mlp": 0.01102325, + "balance_loss_clip": 1.00162137, + "balance_loss_mlp": 1.00009143, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7582692313277226, + "language_loss": 0.59860921, + "learning_rate": 2.356199538526593e-06, + "loss": 0.62085855, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 4.583585739135742 + }, + { + "auxiliary_loss_clip": 0.01154569, + "auxiliary_loss_mlp": 0.01120729, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00056648, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.9465440936518679, + "language_loss": 0.72421873, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74697173, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.6061973571777344 + }, + { + "auxiliary_loss_clip": 0.0112062, + "auxiliary_loss_mlp": 0.01121586, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00056422, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.5070369831614299, + "language_loss": 0.66385609, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68627816, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.6904220581054688 + }, + { + "auxiliary_loss_clip": 0.01154184, + "auxiliary_loss_mlp": 0.01121276, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00063622, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.9506489188807523, + "language_loss": 0.78725958, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81001413, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 2.5992093086242676 + }, + { + "auxiliary_loss_clip": 0.01088288, + "auxiliary_loss_mlp": 0.01121084, + "balance_loss_clip": 1.00147748, + "balance_loss_mlp": 1.00063515, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 1.6872915667456507, + "language_loss": 0.69297075, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71506447, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.7449634075164795 + }, + { + "auxiliary_loss_clip": 0.01152675, + "auxiliary_loss_mlp": 0.01121858, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00074112, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.51922687996066, + "language_loss": 0.83955032, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86229563, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 2.540357828140259 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.00748037, + "balance_loss_clip": 1.00208318, + "balance_loss_mlp": 1.00117183, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 1.8364333934176793, + "language_loss": 0.75109506, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.76996136, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.5652737617492676 + }, + { + "auxiliary_loss_clip": 0.01104406, + "auxiliary_loss_mlp": 0.01121148, + "balance_loss_clip": 1.00179493, + "balance_loss_mlp": 1.00069869, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.5383072046648065, + "language_loss": 0.75798643, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78024197, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.756182909011841 + }, + { + "auxiliary_loss_clip": 0.0110412, + "auxiliary_loss_mlp": 0.01122724, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.00065351, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.1473648595409607, + "language_loss": 0.66216719, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68443573, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.6770448684692383 + }, + { + "auxiliary_loss_clip": 0.01137304, + "auxiliary_loss_mlp": 0.01120759, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00059581, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.697315708941572, + "language_loss": 0.79384869, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81642932, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.6485254764556885 + }, + { + "auxiliary_loss_clip": 0.01120933, + "auxiliary_loss_mlp": 0.01120512, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00053942, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.8361094791302777, + "language_loss": 0.6771704, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69958484, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.6776044368743896 + }, + { + "auxiliary_loss_clip": 0.01139348, + "auxiliary_loss_mlp": 0.01120869, + "balance_loss_clip": 1.00192785, + "balance_loss_mlp": 1.00051522, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.8220456364656465, + "language_loss": 0.81143725, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83403951, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.6507391929626465 + }, + { + "auxiliary_loss_clip": 0.01169429, + "auxiliary_loss_mlp": 0.00748029, + "balance_loss_clip": 1.00217783, + "balance_loss_mlp": 1.00112057, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.317172321916627, + "language_loss": 0.70786035, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72703493, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.55471134185791 + }, + { + "auxiliary_loss_clip": 0.01150792, + "auxiliary_loss_mlp": 0.01102196, + "balance_loss_clip": 1.00140667, + "balance_loss_mlp": 0.99996227, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9633854586889475, + "language_loss": 0.62200236, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64453232, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.2035584449768066 + }, + { + "auxiliary_loss_clip": 0.01107487, + "auxiliary_loss_mlp": 0.01120237, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00055099, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.6367315343639142, + "language_loss": 0.68476671, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70704395, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.7549471855163574 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01121245, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00079572, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.8921436525722353, + "language_loss": 0.77006376, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.7928198, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.566371202468872 + }, + { + "auxiliary_loss_clip": 0.01154101, + "auxiliary_loss_mlp": 0.01120981, + "balance_loss_clip": 1.00211442, + "balance_loss_mlp": 1.00062776, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.7268090864031014, + "language_loss": 0.74959081, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77234161, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.6206459999084473 + }, + { + "auxiliary_loss_clip": 0.01137873, + "auxiliary_loss_mlp": 0.01121775, + "balance_loss_clip": 1.00203001, + "balance_loss_mlp": 1.00075412, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 5.972379611105793, + "language_loss": 0.79769027, + "learning_rate": 2.349682601310998e-06, + "loss": 0.82028675, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.5457282066345215 + }, + { + "auxiliary_loss_clip": 0.01152596, + "auxiliary_loss_mlp": 0.01119857, + "balance_loss_clip": 1.00213122, + "balance_loss_mlp": 1.00045669, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 2.045185894207581, + "language_loss": 0.73736936, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.76009393, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 2.5363123416900635 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01120732, + "balance_loss_clip": 1.00195014, + "balance_loss_mlp": 1.00056911, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.6855883667930411, + "language_loss": 0.72293967, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74535131, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.6081724166870117 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01120362, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00058079, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.7527128767247986, + "language_loss": 0.78058589, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80301297, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.616641044616699 + }, + { + "auxiliary_loss_clip": 0.0112081, + "auxiliary_loss_mlp": 0.01121537, + "balance_loss_clip": 1.00200915, + "balance_loss_mlp": 1.00070655, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.2939711356268107, + "language_loss": 0.74186701, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76429045, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.750765562057495 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01120385, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.0006988, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.836863140144751, + "language_loss": 0.76350194, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78573394, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.6953282356262207 + }, + { + "auxiliary_loss_clip": 0.01103935, + "auxiliary_loss_mlp": 0.01120719, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00055659, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.6190864511745002, + "language_loss": 0.78356588, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80581236, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 2.6894774436950684 + }, + { + "auxiliary_loss_clip": 0.01154061, + "auxiliary_loss_mlp": 0.01120666, + "balance_loss_clip": 1.00194299, + "balance_loss_mlp": 1.00040817, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.575764056387193, + "language_loss": 0.82952154, + "learning_rate": 2.34699803866453e-06, + "loss": 0.85226887, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.6038076877593994 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01120392, + "balance_loss_clip": 1.00201333, + "balance_loss_mlp": 1.00061071, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.7298620243126617, + "language_loss": 0.6339649, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6567055, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.5657107830047607 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.0110237, + "balance_loss_clip": 1.00148153, + "balance_loss_mlp": 1.00013661, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6810566696951262, + "language_loss": 0.55852783, + "learning_rate": 2.346230902123583e-06, + "loss": 0.58091259, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 4.664276838302612 + }, + { + "auxiliary_loss_clip": 0.01152419, + "auxiliary_loss_mlp": 0.01120342, + "balance_loss_clip": 1.00195372, + "balance_loss_mlp": 1.00065613, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.71765349672691, + "language_loss": 0.707434, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73016161, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.5255396366119385 + }, + { + "auxiliary_loss_clip": 0.01139249, + "auxiliary_loss_mlp": 0.01120201, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00061071, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.5518912635458957, + "language_loss": 0.70535481, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72794938, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.699970006942749 + }, + { + "auxiliary_loss_clip": 0.01135039, + "auxiliary_loss_mlp": 0.01121025, + "balance_loss_clip": 1.00165391, + "balance_loss_mlp": 1.00076652, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.3451373018032644, + "language_loss": 0.65386736, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67642802, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 2.747941017150879 + }, + { + "auxiliary_loss_clip": 0.01165478, + "auxiliary_loss_mlp": 0.01101534, + "balance_loss_clip": 1.00138998, + "balance_loss_mlp": 1.0000633, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7226941426292867, + "language_loss": 0.58671743, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60938758, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.0925793647766113 + }, + { + "auxiliary_loss_clip": 0.01117312, + "auxiliary_loss_mlp": 0.01101573, + "balance_loss_clip": 1.00128436, + "balance_loss_mlp": 1.00010192, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7885237308416503, + "language_loss": 0.62732673, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64951563, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 3.0367610454559326 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01120174, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00048828, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.168228990034547, + "language_loss": 0.76634014, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78889883, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 4.109355211257935 + }, + { + "auxiliary_loss_clip": 0.01169416, + "auxiliary_loss_mlp": 0.01120775, + "balance_loss_clip": 1.00214434, + "balance_loss_mlp": 1.00051641, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.91568899827994, + "language_loss": 0.66545308, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68835503, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.526240348815918 + }, + { + "auxiliary_loss_clip": 0.01120236, + "auxiliary_loss_mlp": 0.01120461, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00067914, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.811956819085058, + "language_loss": 0.69874382, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.72115088, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.622509717941284 + }, + { + "auxiliary_loss_clip": 0.01169364, + "auxiliary_loss_mlp": 0.01121033, + "balance_loss_clip": 1.00221562, + "balance_loss_mlp": 1.00067961, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 2.7278989965553593, + "language_loss": 0.63815773, + "learning_rate": 2.342778139478487e-06, + "loss": 0.66106164, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.5244786739349365 + }, + { + "auxiliary_loss_clip": 0.01153688, + "auxiliary_loss_mlp": 0.01120014, + "balance_loss_clip": 1.0020231, + "balance_loss_mlp": 1.00051892, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.414864662264755, + "language_loss": 0.67434424, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69708127, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 3.932354211807251 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01120968, + "balance_loss_clip": 1.0020355, + "balance_loss_mlp": 1.00051928, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.345527595996944, + "language_loss": 0.73975754, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76217759, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.7034738063812256 + }, + { + "auxiliary_loss_clip": 0.01169292, + "auxiliary_loss_mlp": 0.01120337, + "balance_loss_clip": 1.00220609, + "balance_loss_mlp": 1.00065064, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.8618963954282766, + "language_loss": 0.76569355, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78858984, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 3.9982056617736816 + }, + { + "auxiliary_loss_clip": 0.01169365, + "auxiliary_loss_mlp": 0.01121317, + "balance_loss_clip": 1.0021832, + "balance_loss_mlp": 1.00067747, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 1.7014481405818833, + "language_loss": 0.79384851, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81675535, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.5066335201263428 + }, + { + "auxiliary_loss_clip": 0.01121115, + "auxiliary_loss_mlp": 0.01120895, + "balance_loss_clip": 1.00223875, + "balance_loss_mlp": 1.00073206, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 1.7849594778043598, + "language_loss": 0.66937524, + "learning_rate": 2.340859482393731e-06, + "loss": 0.69179529, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.7518935203552246 + }, + { + "auxiliary_loss_clip": 0.01135815, + "auxiliary_loss_mlp": 0.0074798, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.0012362, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 1.9689568599657958, + "language_loss": 0.74058694, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75942487, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.606574296951294 + }, + { + "auxiliary_loss_clip": 0.01087875, + "auxiliary_loss_mlp": 0.01120192, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00060105, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 1.8907404189891759, + "language_loss": 0.74621451, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76829523, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.742051124572754 + }, + { + "auxiliary_loss_clip": 0.01104036, + "auxiliary_loss_mlp": 0.00747926, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00115168, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.7344193380274748, + "language_loss": 0.79222512, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.81074476, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.697300910949707 + }, + { + "auxiliary_loss_clip": 0.01154417, + "auxiliary_loss_mlp": 0.01120752, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.0006839, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 2.8823352767351813, + "language_loss": 0.57248646, + "learning_rate": 2.339324323980964e-06, + "loss": 0.59523815, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01152466, + "auxiliary_loss_mlp": 0.01120146, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00065076, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.1705150885908666, + "language_loss": 0.82752031, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85024643, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.527857780456543 + }, + { + "auxiliary_loss_clip": 0.0113707, + "auxiliary_loss_mlp": 0.01120141, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00054991, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 6.93810609406801, + "language_loss": 0.75349921, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77607131, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.644719123840332 + }, + { + "auxiliary_loss_clip": 0.01122716, + "auxiliary_loss_mlp": 0.01120728, + "balance_loss_clip": 1.00257206, + "balance_loss_mlp": 1.00066078, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.6137117688026992, + "language_loss": 0.73905325, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76148772, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.747603178024292 + }, + { + "auxiliary_loss_clip": 0.01120684, + "auxiliary_loss_mlp": 0.01121557, + "balance_loss_clip": 1.00177968, + "balance_loss_mlp": 1.00063109, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.624284439287314, + "language_loss": 0.85603404, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87845647, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.6229751110076904 + }, + { + "auxiliary_loss_clip": 0.01137193, + "auxiliary_loss_mlp": 0.01120806, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00064325, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 1.9277386542429364, + "language_loss": 0.79040158, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81298161, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.6331026554107666 + }, + { + "auxiliary_loss_clip": 0.01152276, + "auxiliary_loss_mlp": 0.0112044, + "balance_loss_clip": 1.0020318, + "balance_loss_mlp": 1.00056362, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7341315153314052, + "language_loss": 0.72195917, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74468637, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.5507407188415527 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01121296, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00065613, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.6932907789458649, + "language_loss": 0.69303226, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71561521, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.5730135440826416 + }, + { + "auxiliary_loss_clip": 0.01169376, + "auxiliary_loss_mlp": 0.0112094, + "balance_loss_clip": 1.00224566, + "balance_loss_mlp": 1.00068176, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.7433463488955694, + "language_loss": 0.84488684, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.86778998, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.511733055114746 + }, + { + "auxiliary_loss_clip": 0.01169149, + "auxiliary_loss_mlp": 0.01120806, + "balance_loss_clip": 1.00212264, + "balance_loss_mlp": 1.00054741, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 1.7211316680534894, + "language_loss": 0.70968628, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73258579, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 2.4984283447265625 + }, + { + "auxiliary_loss_clip": 0.01091322, + "auxiliary_loss_mlp": 0.01120923, + "balance_loss_clip": 1.00221276, + "balance_loss_mlp": 1.00056958, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 2.10859192981695, + "language_loss": 0.7141782, + "learning_rate": 2.335485529281996e-06, + "loss": 0.73630065, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.7332887649536133 + }, + { + "auxiliary_loss_clip": 0.01169189, + "auxiliary_loss_mlp": 0.0074809, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00140035, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 2.0581238485574507, + "language_loss": 0.72263873, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74181145, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.554651975631714 + }, + { + "auxiliary_loss_clip": 0.01123282, + "auxiliary_loss_mlp": 0.01121357, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00071716, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 1.9557956708987727, + "language_loss": 0.64825642, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67070282, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.75386643409729 + }, + { + "auxiliary_loss_clip": 0.01135548, + "auxiliary_loss_mlp": 0.01119889, + "balance_loss_clip": 1.00182152, + "balance_loss_mlp": 1.00048923, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.8863751460732754, + "language_loss": 0.73279703, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75535142, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.667253255844116 + }, + { + "auxiliary_loss_clip": 0.01137963, + "auxiliary_loss_mlp": 0.01121803, + "balance_loss_clip": 1.00208282, + "balance_loss_mlp": 1.00068593, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.6812187340242333, + "language_loss": 0.68300426, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70560193, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.625290870666504 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01120792, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00043786, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 1.8973801470389535, + "language_loss": 0.80991888, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83265263, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.605215549468994 + }, + { + "auxiliary_loss_clip": 0.01156709, + "auxiliary_loss_mlp": 0.01121402, + "balance_loss_clip": 1.00244546, + "balance_loss_mlp": 1.00057197, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.677517001471617, + "language_loss": 0.76932359, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79210472, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.5291402339935303 + }, + { + "auxiliary_loss_clip": 0.01135704, + "auxiliary_loss_mlp": 0.01120982, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00072384, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.721946824975407, + "language_loss": 0.69850814, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.72107494, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.5965518951416016 + }, + { + "auxiliary_loss_clip": 0.01137161, + "auxiliary_loss_mlp": 0.01121081, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00053644, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.880243011445691, + "language_loss": 0.60982621, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63240868, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.716409683227539 + }, + { + "auxiliary_loss_clip": 0.01120482, + "auxiliary_loss_mlp": 0.01120972, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00061798, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 2.3028258120933103, + "language_loss": 0.77017844, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.792593, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 4.11726975440979 + }, + { + "auxiliary_loss_clip": 0.0116938, + "auxiliary_loss_mlp": 0.01121038, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00078011, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.8525998877822492, + "language_loss": 0.77383041, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79673463, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.5050582885742188 + }, + { + "auxiliary_loss_clip": 0.01154374, + "auxiliary_loss_mlp": 0.01122206, + "balance_loss_clip": 1.0021013, + "balance_loss_mlp": 1.00061297, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 2.504689141676381, + "language_loss": 0.73028123, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75304705, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.5729596614837646 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01121498, + "balance_loss_clip": 1.00230026, + "balance_loss_mlp": 1.00076246, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.3354427091833616, + "language_loss": 0.71653098, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73914593, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.6188747882843018 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01122572, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.0008831, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 1.671010032821497, + "language_loss": 0.73042798, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75303203, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.6405136585235596 + }, + { + "auxiliary_loss_clip": 0.01122691, + "auxiliary_loss_mlp": 0.01121649, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.00062788, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.493946003013002, + "language_loss": 0.5854196, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60786301, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.6516621112823486 + }, + { + "auxiliary_loss_clip": 0.01152888, + "auxiliary_loss_mlp": 0.01120745, + "balance_loss_clip": 1.0020777, + "balance_loss_mlp": 1.0006777, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 1.944010843505477, + "language_loss": 0.70340925, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72614563, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 4.0165183544158936 + }, + { + "auxiliary_loss_clip": 0.0116956, + "auxiliary_loss_mlp": 0.01122089, + "balance_loss_clip": 1.00217915, + "balance_loss_mlp": 1.00078213, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 1.9271898768062865, + "language_loss": 0.68294907, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70586562, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.531445264816284 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.01121547, + "balance_loss_clip": 1.00211954, + "balance_loss_mlp": 1.00052571, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.6684199762420415, + "language_loss": 0.81089139, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83380032, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.549834728240967 + }, + { + "auxiliary_loss_clip": 0.01169386, + "auxiliary_loss_mlp": 0.01121517, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00068665, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.7339152853288167, + "language_loss": 0.73313922, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.7560482, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.5136923789978027 + }, + { + "auxiliary_loss_clip": 0.01169265, + "auxiliary_loss_mlp": 0.00748056, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00121129, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.624779968981006, + "language_loss": 0.7059527, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72512591, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.633918285369873 + }, + { + "auxiliary_loss_clip": 0.0112371, + "auxiliary_loss_mlp": 0.01121273, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00072825, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 2.7117131622440276, + "language_loss": 0.86564517, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88809502, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 4.056005001068115 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01101553, + "balance_loss_clip": 1.0012933, + "balance_loss_mlp": 1.00008154, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7088784581938086, + "language_loss": 0.55032492, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57267624, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.2150580883026123 + }, + { + "auxiliary_loss_clip": 0.01137439, + "auxiliary_loss_mlp": 0.01121239, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.00059938, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 1.8168590309682553, + "language_loss": 0.80242372, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.82501048, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 4.059915542602539 + }, + { + "auxiliary_loss_clip": 0.01169355, + "auxiliary_loss_mlp": 0.01121425, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00049925, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 4.226166946742872, + "language_loss": 0.77973306, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.8026408, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 2.5421550273895264 + }, + { + "auxiliary_loss_clip": 0.01038285, + "auxiliary_loss_mlp": 0.0112051, + "balance_loss_clip": 1.00151622, + "balance_loss_mlp": 1.00053811, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.453646275280702, + "language_loss": 0.68575299, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70734096, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 2.9840407371520996 + }, + { + "auxiliary_loss_clip": 0.0115277, + "auxiliary_loss_mlp": 0.01121517, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00068617, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.4534366618173027, + "language_loss": 0.67205459, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69479746, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.9839088916778564 + }, + { + "auxiliary_loss_clip": 0.01152391, + "auxiliary_loss_mlp": 0.01119569, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00064611, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.6498964456278868, + "language_loss": 0.6522103, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67492992, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 2.638808250427246 + }, + { + "auxiliary_loss_clip": 0.01138934, + "auxiliary_loss_mlp": 0.00748119, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00139117, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.5470746824795485, + "language_loss": 0.74542356, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.76429409, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.6139121055603027 + }, + { + "auxiliary_loss_clip": 0.01135699, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00081253, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.0936681545566342, + "language_loss": 0.78778768, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81035727, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.6818172931671143 + }, + { + "auxiliary_loss_clip": 0.01122093, + "auxiliary_loss_mlp": 0.01120871, + "balance_loss_clip": 1.00204778, + "balance_loss_mlp": 1.00061262, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 1.839432119521459, + "language_loss": 0.7592383, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78166795, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.6124267578125 + }, + { + "auxiliary_loss_clip": 0.01139107, + "auxiliary_loss_mlp": 0.01121235, + "balance_loss_clip": 1.0021292, + "balance_loss_mlp": 1.00069106, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.5512377496698222, + "language_loss": 0.80251002, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82511348, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.612170934677124 + }, + { + "auxiliary_loss_clip": 0.01169204, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_clip": 1.00207722, + "balance_loss_mlp": 1.00070167, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.4741004218614036, + "language_loss": 0.77300215, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79590476, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.502088785171509 + }, + { + "auxiliary_loss_clip": 0.01121129, + "auxiliary_loss_mlp": 0.01120802, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00054359, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.752464456810421, + "language_loss": 0.66056013, + "learning_rate": 2.323192909069061e-06, + "loss": 0.6829794, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.7554194927215576 + }, + { + "auxiliary_loss_clip": 0.01138442, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00061858, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.3198605507759846, + "language_loss": 0.7253322, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74793112, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.5719871520996094 + }, + { + "auxiliary_loss_clip": 0.01165135, + "auxiliary_loss_mlp": 0.01101605, + "balance_loss_clip": 1.00134337, + "balance_loss_mlp": 1.00013399, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.19871135331986, + "language_loss": 0.51917422, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54184163, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.052490711212158 + }, + { + "auxiliary_loss_clip": 0.01136763, + "auxiliary_loss_mlp": 0.01120992, + "balance_loss_clip": 1.00207543, + "balance_loss_mlp": 1.00044751, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.058965307188135, + "language_loss": 0.7598145, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.78239202, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.5574190616607666 + }, + { + "auxiliary_loss_clip": 0.01121038, + "auxiliary_loss_mlp": 0.01120588, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00080633, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 1.8899965305454831, + "language_loss": 0.70209831, + "learning_rate": 2.321655439354519e-06, + "loss": 0.7245146, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.6099159717559814 + }, + { + "auxiliary_loss_clip": 0.01169175, + "auxiliary_loss_mlp": 0.01120376, + "balance_loss_clip": 1.00223708, + "balance_loss_mlp": 1.00059438, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6283849881815367, + "language_loss": 0.72382617, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74672163, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.501077651977539 + }, + { + "auxiliary_loss_clip": 0.01139988, + "auxiliary_loss_mlp": 0.01121838, + "balance_loss_clip": 1.00277734, + "balance_loss_mlp": 1.00062609, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 1.7796668270783451, + "language_loss": 0.83658588, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85920417, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.544923782348633 + }, + { + "auxiliary_loss_clip": 0.0114868, + "auxiliary_loss_mlp": 0.01101608, + "balance_loss_clip": 1.0013051, + "balance_loss_mlp": 1.00013661, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7674776301819082, + "language_loss": 0.57873487, + "learning_rate": 2.320502208946932e-06, + "loss": 0.60123771, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.161846876144409 + }, + { + "auxiliary_loss_clip": 0.01135624, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.00205767, + "balance_loss_mlp": 1.00087214, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 2.3257259444019067, + "language_loss": 0.84844226, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.8710022, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.576307535171509 + }, + { + "auxiliary_loss_clip": 0.01137152, + "auxiliary_loss_mlp": 0.01121325, + "balance_loss_clip": 1.00201154, + "balance_loss_mlp": 1.00059009, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.4094045522924419, + "language_loss": 0.75812805, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78071284, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.6117420196533203 + }, + { + "auxiliary_loss_clip": 0.01120863, + "auxiliary_loss_mlp": 0.01121229, + "balance_loss_clip": 1.00199604, + "balance_loss_mlp": 1.00068474, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.9855660556642203, + "language_loss": 0.8075757, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82999659, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 2.64540696144104 + }, + { + "auxiliary_loss_clip": 0.01136625, + "auxiliary_loss_mlp": 0.01121639, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00080895, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.9140934095891502, + "language_loss": 0.72493893, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74752152, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.595651865005493 + }, + { + "auxiliary_loss_clip": 0.01138293, + "auxiliary_loss_mlp": 0.01121173, + "balance_loss_clip": 1.00233006, + "balance_loss_mlp": 1.00062823, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.1231226868754414, + "language_loss": 0.70958084, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73217547, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.6324524879455566 + }, + { + "auxiliary_loss_clip": 0.01107289, + "auxiliary_loss_mlp": 0.01119044, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00050211, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.6876893906555581, + "language_loss": 0.85231191, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87457526, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.767369031906128 + }, + { + "auxiliary_loss_clip": 0.01153804, + "auxiliary_loss_mlp": 0.01120479, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00069761, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.3210142188846883, + "language_loss": 0.73311675, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75585949, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.5839216709136963 + }, + { + "auxiliary_loss_clip": 0.01152257, + "auxiliary_loss_mlp": 0.01120183, + "balance_loss_clip": 1.00204611, + "balance_loss_mlp": 1.00078321, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.4626598220914326, + "language_loss": 0.69870377, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72142816, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 4.3374786376953125 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01121704, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00058794, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.6131289048231012, + "language_loss": 0.67573732, + "learning_rate": 2.317041863010978e-06, + "loss": 0.6981591, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.744277238845825 + }, + { + "auxiliary_loss_clip": 0.01118754, + "auxiliary_loss_mlp": 0.01121149, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00070047, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 1.9417785695373615, + "language_loss": 0.64053297, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66293198, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.597198247909546 + }, + { + "auxiliary_loss_clip": 0.01153957, + "auxiliary_loss_mlp": 0.01121741, + "balance_loss_clip": 1.00223327, + "balance_loss_mlp": 1.00081491, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 1.759635813541818, + "language_loss": 0.73674381, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.75950074, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 2.5212700366973877 + }, + { + "auxiliary_loss_clip": 0.01140328, + "auxiliary_loss_mlp": 0.0112167, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00064898, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 1.7809295360319948, + "language_loss": 0.74399567, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76661563, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.67533278465271 + }, + { + "auxiliary_loss_clip": 0.01119856, + "auxiliary_loss_mlp": 0.01120244, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00065327, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 2.5852892526049307, + "language_loss": 0.73952448, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.76192546, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.6069154739379883 + }, + { + "auxiliary_loss_clip": 0.01136386, + "auxiliary_loss_mlp": 0.01121384, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.0007441, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 29.211480740238247, + "language_loss": 0.6992811, + "learning_rate": 2.315119027142644e-06, + "loss": 0.72185886, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 4.0621302127838135 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01120804, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00073636, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8171479832103208, + "language_loss": 0.73070681, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75326657, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.5961360931396484 + }, + { + "auxiliary_loss_clip": 0.0113782, + "auxiliary_loss_mlp": 0.01120901, + "balance_loss_clip": 1.00199294, + "balance_loss_mlp": 1.00064254, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.5910995182177066, + "language_loss": 0.79022157, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81280881, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.6190099716186523 + }, + { + "auxiliary_loss_clip": 0.01152468, + "auxiliary_loss_mlp": 0.01120837, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.0005784, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.5681975900948097, + "language_loss": 0.72571218, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74844521, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.5562946796417236 + }, + { + "auxiliary_loss_clip": 0.0115218, + "auxiliary_loss_mlp": 0.01119671, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00055695, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.6924854308309671, + "language_loss": 0.77961969, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80233812, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 4.141287565231323 + }, + { + "auxiliary_loss_clip": 0.01120648, + "auxiliary_loss_mlp": 0.01120728, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00056517, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 2.4323802545127795, + "language_loss": 0.66165817, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68407202, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.6475672721862793 + }, + { + "auxiliary_loss_clip": 0.01137285, + "auxiliary_loss_mlp": 0.01120911, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00074792, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.6286480702245287, + "language_loss": 0.74566805, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76825005, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.5765128135681152 + }, + { + "auxiliary_loss_clip": 0.01137318, + "auxiliary_loss_mlp": 0.01121347, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.00061214, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.3972695834842899, + "language_loss": 0.77853632, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80112302, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 4.281154155731201 + }, + { + "auxiliary_loss_clip": 0.01136395, + "auxiliary_loss_mlp": 0.01119439, + "balance_loss_clip": 1.00189841, + "balance_loss_mlp": 1.00061095, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6121912648297145, + "language_loss": 0.74358708, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76614541, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.615436553955078 + }, + { + "auxiliary_loss_clip": 0.0115248, + "auxiliary_loss_mlp": 0.011217, + "balance_loss_clip": 1.00206101, + "balance_loss_mlp": 1.0006789, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.6017916088777338, + "language_loss": 0.78925467, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81199646, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.546860933303833 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.0110151, + "balance_loss_clip": 1.00139999, + "balance_loss_mlp": 1.0000391, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7959778389789447, + "language_loss": 0.59806377, + "learning_rate": 2.311272461028297e-06, + "loss": 0.62058216, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.1858088970184326 + }, + { + "auxiliary_loss_clip": 0.01120711, + "auxiliary_loss_mlp": 0.01122055, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.0007478, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 16.221456543665784, + "language_loss": 0.78869027, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81111801, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.6122069358825684 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01120684, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00080693, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.7799014064981997, + "language_loss": 0.71732247, + "learning_rate": 2.310503005696839e-06, + "loss": 0.73974591, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.6183900833129883 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01121502, + "balance_loss_clip": 1.00166166, + "balance_loss_mlp": 1.00067127, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 1.7906682340333866, + "language_loss": 0.7801674, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.80257618, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.6197588443756104 + }, + { + "auxiliary_loss_clip": 0.01153979, + "auxiliary_loss_mlp": 0.01120936, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00067759, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 3.665817433870675, + "language_loss": 0.64673704, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.66948617, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.53971791267395 + }, + { + "auxiliary_loss_clip": 0.01152521, + "auxiliary_loss_mlp": 0.01120748, + "balance_loss_clip": 1.00208616, + "balance_loss_mlp": 1.00077629, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.782068754651768, + "language_loss": 0.74173349, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76446617, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.5918381214141846 + }, + { + "auxiliary_loss_clip": 0.01135977, + "auxiliary_loss_mlp": 0.01120117, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00062215, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.5415132072813167, + "language_loss": 0.70923573, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73179668, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.5584981441497803 + }, + { + "auxiliary_loss_clip": 0.01169176, + "auxiliary_loss_mlp": 0.01120489, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00061202, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.7531187891246567, + "language_loss": 0.81086779, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83376443, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.4853789806365967 + }, + { + "auxiliary_loss_clip": 0.01165166, + "auxiliary_loss_mlp": 0.01101459, + "balance_loss_clip": 1.00140238, + "balance_loss_mlp": 0.99998778, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7915691248986351, + "language_loss": 0.55629438, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.5789606, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.1061761379241943 + }, + { + "auxiliary_loss_clip": 0.01154246, + "auxiliary_loss_mlp": 0.00748213, + "balance_loss_clip": 1.00212419, + "balance_loss_mlp": 1.00156236, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 1.701789002905459, + "language_loss": 0.65885109, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67787564, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.6055781841278076 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01120497, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00062048, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8123059432487383, + "language_loss": 0.63460445, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65733612, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.611229181289673 + }, + { + "auxiliary_loss_clip": 0.01137663, + "auxiliary_loss_mlp": 0.01119837, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00053203, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 2.882222584057449, + "language_loss": 0.79806739, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82064235, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.573143720626831 + }, + { + "auxiliary_loss_clip": 0.01122387, + "auxiliary_loss_mlp": 0.01119794, + "balance_loss_clip": 1.00201285, + "balance_loss_mlp": 1.00039411, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.5908493735136038, + "language_loss": 0.77593333, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79835522, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.621161699295044 + }, + { + "auxiliary_loss_clip": 0.01137907, + "auxiliary_loss_mlp": 0.01119799, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00058985, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 2.000158765826076, + "language_loss": 0.69476128, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71733832, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.595126152038574 + }, + { + "auxiliary_loss_clip": 0.01152602, + "auxiliary_loss_mlp": 0.01119814, + "balance_loss_clip": 1.00202668, + "balance_loss_mlp": 1.00070035, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.4085454370382706, + "language_loss": 0.73786497, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.76058906, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.604991912841797 + }, + { + "auxiliary_loss_clip": 0.01152309, + "auxiliary_loss_mlp": 0.01120196, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.0005101, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.9671471255315898, + "language_loss": 0.69308937, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71581435, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.5730679035186768 + }, + { + "auxiliary_loss_clip": 0.01152561, + "auxiliary_loss_mlp": 0.01121311, + "balance_loss_clip": 1.00213146, + "balance_loss_mlp": 1.00067163, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 2.2796645677791942, + "language_loss": 0.7355274, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75826609, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.620025873184204 + }, + { + "auxiliary_loss_clip": 0.01105484, + "auxiliary_loss_mlp": 0.01118897, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00054622, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5768593816879635, + "language_loss": 0.72462606, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74686992, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 2.730637311935425 + }, + { + "auxiliary_loss_clip": 0.01138971, + "auxiliary_loss_mlp": 0.0112202, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00071287, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 1.6800579719451456, + "language_loss": 0.7404865, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76309645, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.6273436546325684 + }, + { + "auxiliary_loss_clip": 0.01152479, + "auxiliary_loss_mlp": 0.01120908, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00064969, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 2.9606521072123915, + "language_loss": 0.63156098, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65429485, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.637176036834717 + }, + { + "auxiliary_loss_clip": 0.01137653, + "auxiliary_loss_mlp": 0.01120895, + "balance_loss_clip": 1.00205433, + "balance_loss_mlp": 1.00082779, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.7263436505591083, + "language_loss": 0.63251805, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65510356, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.66078519821167 + }, + { + "auxiliary_loss_clip": 0.01152676, + "auxiliary_loss_mlp": 0.01121458, + "balance_loss_clip": 1.00213993, + "balance_loss_mlp": 1.00072265, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 3.663581057405336, + "language_loss": 0.67865241, + "learning_rate": 2.303190847569801e-06, + "loss": 0.70139372, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.5273735523223877 + }, + { + "auxiliary_loss_clip": 0.01136003, + "auxiliary_loss_mlp": 0.01120021, + "balance_loss_clip": 1.00213468, + "balance_loss_mlp": 1.00062072, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.7269350517613042, + "language_loss": 0.84689569, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.86945593, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 4.035769701004028 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01119269, + "balance_loss_clip": 1.00159943, + "balance_loss_mlp": 1.00053668, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 1.8632192561293028, + "language_loss": 0.77335256, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79573375, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.5919008255004883 + }, + { + "auxiliary_loss_clip": 0.01152264, + "auxiliary_loss_mlp": 0.01118677, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00061202, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 1.8898757480105515, + "language_loss": 0.74375248, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76646191, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.5882296562194824 + }, + { + "auxiliary_loss_clip": 0.01137906, + "auxiliary_loss_mlp": 0.01120132, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00092232, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.6418197975930715, + "language_loss": 0.65157115, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67415154, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 2.6941349506378174 + }, + { + "auxiliary_loss_clip": 0.01152255, + "auxiliary_loss_mlp": 0.01119035, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.0006845, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.7451843980397075, + "language_loss": 0.64141095, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66412383, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.5996391773223877 + }, + { + "auxiliary_loss_clip": 0.01148705, + "auxiliary_loss_mlp": 0.01100774, + "balance_loss_clip": 1.00140929, + "balance_loss_mlp": 1.0000658, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7050808561966996, + "language_loss": 0.61872989, + "learning_rate": 2.300880877982825e-06, + "loss": 0.64122468, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 3.203145742416382 + }, + { + "auxiliary_loss_clip": 0.01122594, + "auxiliary_loss_mlp": 0.0111957, + "balance_loss_clip": 1.00211334, + "balance_loss_mlp": 1.00064635, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5988440433922428, + "language_loss": 0.78893721, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81135881, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 4.059665679931641 + }, + { + "auxiliary_loss_clip": 0.01152424, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_clip": 1.00214863, + "balance_loss_mlp": 1.00078654, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.6003730525072843, + "language_loss": 0.75083518, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77355552, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.610142707824707 + }, + { + "auxiliary_loss_clip": 0.01138749, + "auxiliary_loss_mlp": 0.01119501, + "balance_loss_clip": 1.00200403, + "balance_loss_mlp": 1.00076866, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.4330426172203727, + "language_loss": 0.68391883, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70650131, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.01152448, + "auxiliary_loss_mlp": 0.0074811, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00150323, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 2.5269910184515862, + "language_loss": 0.73806089, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.75706649, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.6341423988342285 + }, + { + "auxiliary_loss_clip": 0.01121095, + "auxiliary_loss_mlp": 0.01120245, + "balance_loss_clip": 1.0019933, + "balance_loss_mlp": 1.0006541, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.4980236864888694, + "language_loss": 0.63032049, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65273392, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 4.051521301269531 + }, + { + "auxiliary_loss_clip": 0.01124325, + "auxiliary_loss_mlp": 0.01119828, + "balance_loss_clip": 1.00209987, + "balance_loss_mlp": 1.00052333, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 2.2905988582178884, + "language_loss": 0.68292117, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70536268, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.7415947914123535 + }, + { + "auxiliary_loss_clip": 0.0116912, + "auxiliary_loss_mlp": 0.00748069, + "balance_loss_clip": 1.00209987, + "balance_loss_mlp": 1.00141501, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.7679333018252221, + "language_loss": 0.70550352, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72467542, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 2.5547292232513428 + }, + { + "auxiliary_loss_clip": 0.01140623, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_clip": 1.00228262, + "balance_loss_mlp": 1.000669, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 1.9251562893682412, + "language_loss": 0.67122471, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69383359, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 3.982701301574707 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01100703, + "balance_loss_clip": 1.00139356, + "balance_loss_mlp": 0.99999523, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9340432698968817, + "language_loss": 0.64503479, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66754574, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.2927112579345703 + }, + { + "auxiliary_loss_clip": 0.01121236, + "auxiliary_loss_mlp": 0.01119407, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00067413, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.368109967576189, + "language_loss": 0.72334445, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74575084, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.6696956157684326 + }, + { + "auxiliary_loss_clip": 0.01169108, + "auxiliary_loss_mlp": 0.01119175, + "balance_loss_clip": 1.0022049, + "balance_loss_mlp": 1.00072837, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.6896560640209781, + "language_loss": 0.72286463, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74574745, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.5366835594177246 + }, + { + "auxiliary_loss_clip": 0.01122385, + "auxiliary_loss_mlp": 0.0112018, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00068486, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 6.769511016070093, + "language_loss": 0.62857515, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.6510008, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.62096905708313 + }, + { + "auxiliary_loss_clip": 0.0116912, + "auxiliary_loss_mlp": 0.01120971, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00080836, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.8636037913933283, + "language_loss": 0.73740005, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76030099, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 2.5582456588745117 + }, + { + "auxiliary_loss_clip": 0.01135834, + "auxiliary_loss_mlp": 0.00748048, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00132549, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 1.5854616865682072, + "language_loss": 0.77728528, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.7961241, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.591200590133667 + }, + { + "auxiliary_loss_clip": 0.01136106, + "auxiliary_loss_mlp": 0.01120047, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00045645, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8514479367655647, + "language_loss": 0.76989591, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79245746, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 2.5720643997192383 + }, + { + "auxiliary_loss_clip": 0.01169378, + "auxiliary_loss_mlp": 0.0112088, + "balance_loss_clip": 1.00216854, + "balance_loss_mlp": 1.00081289, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.8190784762014534, + "language_loss": 0.82608646, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.84898907, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.589407444000244 + }, + { + "auxiliary_loss_clip": 0.01135693, + "auxiliary_loss_mlp": 0.01120196, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00079632, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.6253251111255123, + "language_loss": 0.77404392, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79660285, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.7314884662628174 + }, + { + "auxiliary_loss_clip": 0.01136328, + "auxiliary_loss_mlp": 0.01121027, + "balance_loss_clip": 1.00200117, + "balance_loss_mlp": 1.0005784, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.6551055646612036, + "language_loss": 0.51482594, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53739947, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.5563321113586426 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.01100729, + "balance_loss_clip": 1.00128269, + "balance_loss_mlp": 1.00002086, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.7760090118176541, + "language_loss": 0.57776183, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59976393, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.102269172668457 + }, + { + "auxiliary_loss_clip": 0.01106784, + "auxiliary_loss_mlp": 0.01120199, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00070381, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 1.910328814990508, + "language_loss": 0.71423352, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.7365033, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.6384308338165283 + }, + { + "auxiliary_loss_clip": 0.01152604, + "auxiliary_loss_mlp": 0.01120136, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.00064099, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 1.9424382691277802, + "language_loss": 0.80722558, + "learning_rate": 2.29279277055369e-06, + "loss": 0.82995296, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.5669167041778564 + }, + { + "auxiliary_loss_clip": 0.01152793, + "auxiliary_loss_mlp": 0.01120883, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.00081515, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.4769053643579872, + "language_loss": 0.80403876, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82677549, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.5559942722320557 + }, + { + "auxiliary_loss_clip": 0.01109154, + "auxiliary_loss_mlp": 0.0111943, + "balance_loss_clip": 1.0021739, + "balance_loss_mlp": 1.0007925, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.6074970467717697, + "language_loss": 0.74025512, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76254106, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.668351888656616 + }, + { + "auxiliary_loss_clip": 0.01135929, + "auxiliary_loss_mlp": 0.01119857, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00055242, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.083857094222412, + "language_loss": 0.84517413, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86773199, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.5339207649230957 + }, + { + "auxiliary_loss_clip": 0.01138974, + "auxiliary_loss_mlp": 0.01119314, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00086737, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.76389223384892, + "language_loss": 0.81731993, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83990282, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.570277452468872 + }, + { + "auxiliary_loss_clip": 0.01101938, + "auxiliary_loss_mlp": 0.0111974, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00062585, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 1.9388455836931453, + "language_loss": 0.77817082, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.80038762, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.669602394104004 + }, + { + "auxiliary_loss_clip": 0.01165228, + "auxiliary_loss_mlp": 0.01100748, + "balance_loss_clip": 1.001472, + "balance_loss_mlp": 1.00004041, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8427443268285675, + "language_loss": 0.59273428, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61539406, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.0934598445892334 + }, + { + "auxiliary_loss_clip": 0.01137021, + "auxiliary_loss_mlp": 0.01119761, + "balance_loss_clip": 1.00203311, + "balance_loss_mlp": 1.00055194, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.6595340602942996, + "language_loss": 0.7935071, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81607497, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.6184511184692383 + }, + { + "auxiliary_loss_clip": 0.01169157, + "auxiliary_loss_mlp": 0.01120136, + "balance_loss_clip": 1.0020771, + "balance_loss_mlp": 1.00064087, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.495440860729249, + "language_loss": 0.83533072, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85822368, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.5153982639312744 + }, + { + "auxiliary_loss_clip": 0.01124036, + "auxiliary_loss_mlp": 0.01120479, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00060225, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 1.9080092330849152, + "language_loss": 0.75474775, + "learning_rate": 2.289324932042186e-06, + "loss": 0.77719295, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.606900215148926 + }, + { + "auxiliary_loss_clip": 0.01152682, + "auxiliary_loss_mlp": 0.01120048, + "balance_loss_clip": 1.00203741, + "balance_loss_mlp": 1.00064826, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 2.3748915911024127, + "language_loss": 0.73877043, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76149774, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.532675266265869 + }, + { + "auxiliary_loss_clip": 0.01169127, + "auxiliary_loss_mlp": 0.01119684, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 1.00066519, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.8067747864397325, + "language_loss": 0.88711226, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91000038, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 4.068840980529785 + }, + { + "auxiliary_loss_clip": 0.01152348, + "auxiliary_loss_mlp": 0.01120321, + "balance_loss_clip": 1.00213504, + "balance_loss_mlp": 1.00053954, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.4651997808551875, + "language_loss": 0.79495144, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81767821, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.576375722885132 + }, + { + "auxiliary_loss_clip": 0.01132711, + "auxiliary_loss_mlp": 0.01100866, + "balance_loss_clip": 1.00140405, + "balance_loss_mlp": 1.00015855, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6932579230380027, + "language_loss": 0.56662321, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58895898, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.2329132556915283 + }, + { + "auxiliary_loss_clip": 0.01137733, + "auxiliary_loss_mlp": 0.01120317, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00072658, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.8045567562837728, + "language_loss": 0.80840427, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83098483, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.5639612674713135 + }, + { + "auxiliary_loss_clip": 0.01137049, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.00201309, + "balance_loss_mlp": 1.00071931, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.6523990359515792, + "language_loss": 0.66728836, + "learning_rate": 2.287012545338324e-06, + "loss": 0.6898582, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.6426515579223633 + }, + { + "auxiliary_loss_clip": 0.01137329, + "auxiliary_loss_mlp": 0.01119718, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00069916, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 1.6686187105283377, + "language_loss": 0.8386094, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86117989, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 2.5651543140411377 + }, + { + "auxiliary_loss_clip": 0.01133902, + "auxiliary_loss_mlp": 0.01100758, + "balance_loss_clip": 1.00136673, + "balance_loss_mlp": 1.00004983, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8064958938398767, + "language_loss": 0.55675614, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57910275, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 4.5885796546936035 + }, + { + "auxiliary_loss_clip": 0.0116899, + "auxiliary_loss_mlp": 0.01119748, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00063396, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.7482835672858241, + "language_loss": 0.80900633, + "learning_rate": 2.285856204861245e-06, + "loss": 0.83189368, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.5074856281280518 + }, + { + "auxiliary_loss_clip": 0.01169161, + "auxiliary_loss_mlp": 0.01119342, + "balance_loss_clip": 1.0021975, + "balance_loss_mlp": 1.00070465, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3586508415549556, + "language_loss": 0.75979316, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78267825, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 2.5755927562713623 + }, + { + "auxiliary_loss_clip": 0.01121751, + "auxiliary_loss_mlp": 0.01119275, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00063765, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 1.8306826304386914, + "language_loss": 0.78934836, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81175864, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.60725736618042 + }, + { + "auxiliary_loss_clip": 0.01125969, + "auxiliary_loss_mlp": 0.01120919, + "balance_loss_clip": 1.00226688, + "balance_loss_mlp": 1.0006609, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.5388465520261607, + "language_loss": 0.75686473, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.77933359, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.6852474212646484 + }, + { + "auxiliary_loss_clip": 0.01135624, + "auxiliary_loss_mlp": 0.01118693, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.0005331, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.3917447819423634, + "language_loss": 0.74525803, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76780117, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.591386079788208 + }, + { + "auxiliary_loss_clip": 0.01152402, + "auxiliary_loss_mlp": 0.01120228, + "balance_loss_clip": 1.00208127, + "balance_loss_mlp": 1.00073278, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.4653057312456663, + "language_loss": 0.75742316, + "learning_rate": 2.283928754133762e-06, + "loss": 0.78014952, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 4.42058253288269 + }, + { + "auxiliary_loss_clip": 0.01102559, + "auxiliary_loss_mlp": 0.01119994, + "balance_loss_clip": 1.00191557, + "balance_loss_mlp": 1.00087976, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.3995031247770768, + "language_loss": 0.66025931, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68248481, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 4.2920238971710205 + }, + { + "auxiliary_loss_clip": 0.01150378, + "auxiliary_loss_mlp": 0.00746471, + "balance_loss_clip": 1.00148749, + "balance_loss_mlp": 1.00067568, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8686105447409765, + "language_loss": 0.62119216, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64016062, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.119495153427124 + }, + { + "auxiliary_loss_clip": 0.0112275, + "auxiliary_loss_mlp": 0.00748031, + "balance_loss_clip": 1.00186718, + "balance_loss_mlp": 1.00116587, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.6461123094155092, + "language_loss": 0.69686377, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71557158, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.664249897003174 + }, + { + "auxiliary_loss_clip": 0.01155953, + "auxiliary_loss_mlp": 0.01120077, + "balance_loss_clip": 1.00227141, + "balance_loss_mlp": 1.00058138, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.693132431300517, + "language_loss": 0.66598171, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68874204, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 2.5555942058563232 + }, + { + "auxiliary_loss_clip": 0.01137049, + "auxiliary_loss_mlp": 0.01120402, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.000525, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.7158615964682384, + "language_loss": 0.77406812, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79664254, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.6893835067749023 + }, + { + "auxiliary_loss_clip": 0.0111853, + "auxiliary_loss_mlp": 0.01119296, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00065935, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 2.3802311026301397, + "language_loss": 0.72780901, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75018728, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.7199270725250244 + }, + { + "auxiliary_loss_clip": 0.01135932, + "auxiliary_loss_mlp": 0.01120029, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.0005331, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.6192637038941022, + "language_loss": 0.74960542, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.772165, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.6123106479644775 + }, + { + "auxiliary_loss_clip": 0.01139047, + "auxiliary_loss_mlp": 0.01119643, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00072002, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.6533947130401012, + "language_loss": 0.7044667, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72705358, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.598362922668457 + }, + { + "auxiliary_loss_clip": 0.01153347, + "auxiliary_loss_mlp": 0.01119593, + "balance_loss_clip": 1.00215936, + "balance_loss_mlp": 1.00057435, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 1.9687151201073432, + "language_loss": 0.78542405, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80815351, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 2.5321614742279053 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01119723, + "balance_loss_clip": 1.00225377, + "balance_loss_mlp": 1.00060952, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.522965692010807, + "language_loss": 0.74213505, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76488847, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.5786774158477783 + }, + { + "auxiliary_loss_clip": 0.01135541, + "auxiliary_loss_mlp": 0.01119596, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00086379, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.4165906671663864, + "language_loss": 0.7848891, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80744046, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.5944743156433105 + }, + { + "auxiliary_loss_clip": 0.01153622, + "auxiliary_loss_mlp": 0.01119121, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00067472, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.2919553015783183, + "language_loss": 0.73145735, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75418484, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.6024973392486572 + }, + { + "auxiliary_loss_clip": 0.01152202, + "auxiliary_loss_mlp": 0.01119106, + "balance_loss_clip": 1.00201428, + "balance_loss_mlp": 1.00046873, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.4905222378721845, + "language_loss": 0.74197805, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76469111, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.01103918, + "auxiliary_loss_mlp": 0.01119904, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00079012, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.5942421910327107, + "language_loss": 0.80690777, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82914603, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.6542391777038574 + }, + { + "auxiliary_loss_clip": 0.011542, + "auxiliary_loss_mlp": 0.0112084, + "balance_loss_clip": 1.00226259, + "balance_loss_mlp": 1.00067663, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.9878221944866594, + "language_loss": 0.70547187, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72822225, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.5142807960510254 + }, + { + "auxiliary_loss_clip": 0.01120807, + "auxiliary_loss_mlp": 0.01120273, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00068235, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.1751060268815934, + "language_loss": 0.69692624, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71933711, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.601045846939087 + }, + { + "auxiliary_loss_clip": 0.0108907, + "auxiliary_loss_mlp": 0.01119893, + "balance_loss_clip": 1.00193155, + "balance_loss_mlp": 1.00068343, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.7746568073406255, + "language_loss": 0.74964333, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.77173287, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.6899781227111816 + }, + { + "auxiliary_loss_clip": 0.01090508, + "auxiliary_loss_mlp": 0.0112006, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.00056505, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.9951314265913234, + "language_loss": 0.76731622, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78942186, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.6836049556732178 + }, + { + "auxiliary_loss_clip": 0.01120407, + "auxiliary_loss_mlp": 0.01119487, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00065947, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.8626135729198539, + "language_loss": 0.69473457, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71713352, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.01103194, + "auxiliary_loss_mlp": 0.01100697, + "balance_loss_clip": 1.00141382, + "balance_loss_mlp": 0.99998945, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7053146231726234, + "language_loss": 0.50104743, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52308631, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.3664357662200928 + }, + { + "auxiliary_loss_clip": 0.01152593, + "auxiliary_loss_mlp": 0.01120607, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.0005393, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.920438280731703, + "language_loss": 0.63798535, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66071725, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.5503628253936768 + }, + { + "auxiliary_loss_clip": 0.01152335, + "auxiliary_loss_mlp": 0.01119732, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.00071323, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.7601647390243242, + "language_loss": 0.75855458, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.78127533, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.6112358570098877 + }, + { + "auxiliary_loss_clip": 0.01135456, + "auxiliary_loss_mlp": 0.01119299, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00056636, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.6217105438828598, + "language_loss": 0.74510729, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76765484, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.653357982635498 + }, + { + "auxiliary_loss_clip": 0.01136459, + "auxiliary_loss_mlp": 0.0111904, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.00078428, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4690036954356316, + "language_loss": 0.64651519, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66907012, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.6735174655914307 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.0074802, + "balance_loss_clip": 1.00205195, + "balance_loss_mlp": 1.00129163, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.4687772811134812, + "language_loss": 0.7043463, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72336566, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 2.5830836296081543 + }, + { + "auxiliary_loss_clip": 0.01169216, + "auxiliary_loss_mlp": 0.01120028, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00072384, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.5701264406639568, + "language_loss": 0.6173979, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64029032, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.5170061588287354 + }, + { + "auxiliary_loss_clip": 0.01135214, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.00200808, + "balance_loss_mlp": 1.00099647, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.381749202446873, + "language_loss": 0.7166329, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.73918808, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 4.118801832199097 + }, + { + "auxiliary_loss_clip": 0.01137371, + "auxiliary_loss_mlp": 0.01119769, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00074995, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8896463575846805, + "language_loss": 0.84670156, + "learning_rate": 2.273130107677896e-06, + "loss": 0.86927301, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.567697525024414 + }, + { + "auxiliary_loss_clip": 0.01169221, + "auxiliary_loss_mlp": 0.01119791, + "balance_loss_clip": 1.00219822, + "balance_loss_mlp": 1.00067735, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.9043788046561785, + "language_loss": 0.84290874, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86579883, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 2.5089361667633057 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.0112038, + "balance_loss_clip": 1.00207651, + "balance_loss_mlp": 1.00088429, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.8037858423564748, + "language_loss": 0.65778118, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68034351, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.6182429790496826 + }, + { + "auxiliary_loss_clip": 0.01169061, + "auxiliary_loss_mlp": 0.0111975, + "balance_loss_clip": 1.00208068, + "balance_loss_mlp": 1.00054085, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.8251512760188189, + "language_loss": 0.6491679, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67205602, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.5380020141601562 + }, + { + "auxiliary_loss_clip": 0.01137124, + "auxiliary_loss_mlp": 0.00748058, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00124049, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.7756776258149047, + "language_loss": 0.74204612, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76089799, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 4.079986810684204 + }, + { + "auxiliary_loss_clip": 0.01169066, + "auxiliary_loss_mlp": 0.01120089, + "balance_loss_clip": 1.0020442, + "balance_loss_mlp": 1.0005939, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 1.8569395046765567, + "language_loss": 0.82580221, + "learning_rate": 2.271200914239451e-06, + "loss": 0.84869373, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 2.52488112449646 + }, + { + "auxiliary_loss_clip": 0.01152164, + "auxiliary_loss_mlp": 0.01119446, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00052261, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.9611522351372956, + "language_loss": 0.79500753, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81772363, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.56091570854187 + }, + { + "auxiliary_loss_clip": 0.01073812, + "auxiliary_loss_mlp": 0.01119763, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00064921, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 3.948518700037278, + "language_loss": 0.74753237, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76946819, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.7424097061157227 + }, + { + "auxiliary_loss_clip": 0.01136822, + "auxiliary_loss_mlp": 0.0112018, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00068462, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.5594732438992538, + "language_loss": 0.73599321, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75856316, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 2.5918686389923096 + }, + { + "auxiliary_loss_clip": 0.01169268, + "auxiliary_loss_mlp": 0.01121576, + "balance_loss_clip": 1.00219572, + "balance_loss_mlp": 1.00074589, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 2.0813467761333095, + "language_loss": 0.81315511, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83606362, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 3.977888584136963 + }, + { + "auxiliary_loss_clip": 0.01154027, + "auxiliary_loss_mlp": 0.01120383, + "balance_loss_clip": 1.00206923, + "balance_loss_mlp": 1.00060177, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.5075292959190687, + "language_loss": 0.75620753, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77895164, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 3.9706828594207764 + }, + { + "auxiliary_loss_clip": 0.01119727, + "auxiliary_loss_mlp": 0.01119918, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00070882, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.6923126568798188, + "language_loss": 0.67882085, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70121729, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.7923009395599365 + }, + { + "auxiliary_loss_clip": 0.01153693, + "auxiliary_loss_mlp": 0.01119725, + "balance_loss_clip": 1.00216603, + "balance_loss_mlp": 1.00051546, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6426135756132207, + "language_loss": 0.72845864, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75119287, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 2.57415509223938 + }, + { + "auxiliary_loss_clip": 0.01137767, + "auxiliary_loss_mlp": 0.01119303, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00066638, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.376733486570476, + "language_loss": 0.6480118, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67058253, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.5767884254455566 + }, + { + "auxiliary_loss_clip": 0.01106435, + "auxiliary_loss_mlp": 0.01120417, + "balance_loss_clip": 1.00208223, + "balance_loss_mlp": 1.00063562, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.375350086233856, + "language_loss": 0.81243628, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83470476, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.7406516075134277 + }, + { + "auxiliary_loss_clip": 0.01140406, + "auxiliary_loss_mlp": 0.01119323, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00059044, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.874332559603461, + "language_loss": 0.78785568, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81045306, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 2.579514741897583 + }, + { + "auxiliary_loss_clip": 0.01153476, + "auxiliary_loss_mlp": 0.00748092, + "balance_loss_clip": 1.00198817, + "balance_loss_mlp": 1.00128043, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.8032265231150364, + "language_loss": 0.70752609, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72654176, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.5832483768463135 + }, + { + "auxiliary_loss_clip": 0.01119492, + "auxiliary_loss_mlp": 0.01119236, + "balance_loss_clip": 1.00198829, + "balance_loss_mlp": 1.00069475, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.4915378884923494, + "language_loss": 0.75263035, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77501762, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.6879019737243652 + }, + { + "auxiliary_loss_clip": 0.01133016, + "auxiliary_loss_mlp": 0.01099974, + "balance_loss_clip": 1.00157022, + "balance_loss_mlp": 1.00002861, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7221704565621714, + "language_loss": 0.61328024, + "learning_rate": 2.266183812641164e-06, + "loss": 0.6356101, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.1927056312561035 + }, + { + "auxiliary_loss_clip": 0.01137461, + "auxiliary_loss_mlp": 0.01119331, + "balance_loss_clip": 1.00203514, + "balance_loss_mlp": 1.00069427, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.4300496026909466, + "language_loss": 0.6814366, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70400453, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.6328771114349365 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_clip": 1.00160944, + "balance_loss_mlp": 1.00052023, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.7373250255424537, + "language_loss": 0.76732731, + "learning_rate": 2.265411798646092e-06, + "loss": 0.78922188, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.8187716007232666 + }, + { + "auxiliary_loss_clip": 0.01152169, + "auxiliary_loss_mlp": 0.01119456, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00062871, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.5770214150266804, + "language_loss": 0.76114357, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78385979, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 2.975355386734009 + }, + { + "auxiliary_loss_clip": 0.01136545, + "auxiliary_loss_mlp": 0.01119672, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00065303, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7051499485542503, + "language_loss": 0.72243625, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.7449984, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.5911624431610107 + }, + { + "auxiliary_loss_clip": 0.01152316, + "auxiliary_loss_mlp": 0.01120784, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00062108, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 1.7851572661635358, + "language_loss": 0.82022947, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84296048, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.5114762783050537 + }, + { + "auxiliary_loss_clip": 0.01139208, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_clip": 1.00226569, + "balance_loss_mlp": 1.0008297, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.8231762453390539, + "language_loss": 0.7361629, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75876105, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.606375217437744 + }, + { + "auxiliary_loss_clip": 0.01137025, + "auxiliary_loss_mlp": 0.01121072, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00081396, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.5542894320688583, + "language_loss": 0.73425907, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75683999, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.613800287246704 + }, + { + "auxiliary_loss_clip": 0.01153928, + "auxiliary_loss_mlp": 0.01118933, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00058222, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.6283348427516364, + "language_loss": 0.76895565, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79168427, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.530764102935791 + }, + { + "auxiliary_loss_clip": 0.01152343, + "auxiliary_loss_mlp": 0.0111936, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00072289, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.723204314102134, + "language_loss": 0.72451854, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.7472356, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.6010591983795166 + }, + { + "auxiliary_loss_clip": 0.01165086, + "auxiliary_loss_mlp": 0.01099912, + "balance_loss_clip": 1.00152063, + "balance_loss_mlp": 0.99996674, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7144945322385893, + "language_loss": 0.56052363, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58317363, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.1480846405029297 + }, + { + "auxiliary_loss_clip": 0.01152251, + "auxiliary_loss_mlp": 0.01120443, + "balance_loss_clip": 1.00207686, + "balance_loss_mlp": 1.00085258, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.0465143451837045, + "language_loss": 0.65298331, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67571026, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.5778892040252686 + }, + { + "auxiliary_loss_clip": 0.01169183, + "auxiliary_loss_mlp": 0.01120436, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00065446, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.078601601452534, + "language_loss": 0.70140493, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72430116, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.497178077697754 + }, + { + "auxiliary_loss_clip": 0.01132181, + "auxiliary_loss_mlp": 0.01099975, + "balance_loss_clip": 1.00155115, + "balance_loss_mlp": 1.0000298, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8047819347004936, + "language_loss": 0.58564895, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60797048, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.2211687564849854 + }, + { + "auxiliary_loss_clip": 0.01152538, + "auxiliary_loss_mlp": 0.01118873, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00071239, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 3.200476402257797, + "language_loss": 0.77632642, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79904056, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.5349364280700684 + }, + { + "auxiliary_loss_clip": 0.01153177, + "auxiliary_loss_mlp": 0.01119183, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00064158, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 2.274297504760849, + "language_loss": 0.74638617, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76910973, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.558340072631836 + }, + { + "auxiliary_loss_clip": 0.01153998, + "auxiliary_loss_mlp": 0.01119449, + "balance_loss_clip": 1.00214469, + "balance_loss_mlp": 1.00052595, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.7006892010750232, + "language_loss": 0.82720816, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84994268, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.540158271789551 + }, + { + "auxiliary_loss_clip": 0.01152026, + "auxiliary_loss_mlp": 0.01120407, + "balance_loss_clip": 1.00202167, + "balance_loss_mlp": 1.00062549, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.81568663400638, + "language_loss": 0.75534105, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77806544, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.542985439300537 + }, + { + "auxiliary_loss_clip": 0.01135787, + "auxiliary_loss_mlp": 0.0111962, + "balance_loss_clip": 1.00205755, + "balance_loss_mlp": 1.00079203, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 1.8961572766118353, + "language_loss": 0.63946998, + "learning_rate": 2.25923424724351e-06, + "loss": 0.66202402, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.5638792514801025 + }, + { + "auxiliary_loss_clip": 0.01122118, + "auxiliary_loss_mlp": 0.01119327, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00068951, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.368062558408646, + "language_loss": 0.70051169, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72292614, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 4.034625291824341 + }, + { + "auxiliary_loss_clip": 0.01151854, + "auxiliary_loss_mlp": 0.01119554, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.00063109, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 3.9873441078459844, + "language_loss": 0.68407434, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70678842, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.610502243041992 + }, + { + "auxiliary_loss_clip": 0.01119085, + "auxiliary_loss_mlp": 0.01119699, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00068009, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9614827717954972, + "language_loss": 0.70309174, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72547954, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.6957473754882812 + }, + { + "auxiliary_loss_clip": 0.01136591, + "auxiliary_loss_mlp": 0.0111974, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00091243, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.6346630599119707, + "language_loss": 0.73561245, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75817573, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.598336696624756 + }, + { + "auxiliary_loss_clip": 0.01120186, + "auxiliary_loss_mlp": 0.01119036, + "balance_loss_clip": 1.00201285, + "balance_loss_mlp": 1.00058913, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.6992318528774817, + "language_loss": 0.69005495, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71244717, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 4.002087831497192 + }, + { + "auxiliary_loss_clip": 0.01139075, + "auxiliary_loss_mlp": 0.01118405, + "balance_loss_clip": 1.00198138, + "balance_loss_mlp": 1.00072122, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.7511088669400339, + "language_loss": 0.71962416, + "learning_rate": 2.256917013453848e-06, + "loss": 0.742199, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.5654163360595703 + }, + { + "auxiliary_loss_clip": 0.01093221, + "auxiliary_loss_mlp": 0.01119161, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.0004282, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 2.566916367619836, + "language_loss": 0.86131942, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88344324, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.707275390625 + }, + { + "auxiliary_loss_clip": 0.011537, + "auxiliary_loss_mlp": 0.01118125, + "balance_loss_clip": 1.0020324, + "balance_loss_mlp": 1.00053716, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.4847300759193789, + "language_loss": 0.82314348, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84586167, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 2.593534469604492 + }, + { + "auxiliary_loss_clip": 0.01117573, + "auxiliary_loss_mlp": 0.01099916, + "balance_loss_clip": 1.00137937, + "balance_loss_mlp": 0.99997109, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6653738081194436, + "language_loss": 0.58914995, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61132485, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 3.3044726848602295 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01118967, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00071168, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.762811025034036, + "language_loss": 0.80811393, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83084273, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.5360236167907715 + }, + { + "auxiliary_loss_clip": 0.01152172, + "auxiliary_loss_mlp": 0.01119006, + "balance_loss_clip": 1.00203431, + "balance_loss_mlp": 1.00075078, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.6647347937789323, + "language_loss": 0.74102271, + "learning_rate": 2.254985717247797e-06, + "loss": 0.76373446, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 3.963021755218506 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.01119464, + "balance_loss_clip": 1.00214243, + "balance_loss_mlp": 1.00082743, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.4881592446058494, + "language_loss": 0.75235629, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77492321, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 4.064457893371582 + }, + { + "auxiliary_loss_clip": 0.01152308, + "auxiliary_loss_mlp": 0.01118568, + "balance_loss_clip": 1.00210762, + "balance_loss_mlp": 1.00069416, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 2.2773004814253586, + "language_loss": 0.791062, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81377077, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.5452804565429688 + }, + { + "auxiliary_loss_clip": 0.01137536, + "auxiliary_loss_mlp": 0.00747795, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00106168, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 2.300899916843231, + "language_loss": 0.76142645, + "learning_rate": 2.253826823377983e-06, + "loss": 0.78027976, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.5928561687469482 + }, + { + "auxiliary_loss_clip": 0.0116899, + "auxiliary_loss_mlp": 0.01119579, + "balance_loss_clip": 1.00213671, + "balance_loss_mlp": 1.00084651, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.4043414676167205, + "language_loss": 0.7449342, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76781994, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.539099931716919 + }, + { + "auxiliary_loss_clip": 0.01137176, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_clip": 1.00210178, + "balance_loss_mlp": 1.00063443, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 1.8111060373731867, + "language_loss": 0.72733891, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74989867, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.5748209953308105 + }, + { + "auxiliary_loss_clip": 0.01135633, + "auxiliary_loss_mlp": 0.01119203, + "balance_loss_clip": 1.00182903, + "balance_loss_mlp": 1.00066185, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 1.9594508233983017, + "language_loss": 0.64324737, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.6657958, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.596310615539551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01118203, + "balance_loss_clip": 1.00207627, + "balance_loss_mlp": 1.00071013, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.5820428914367, + "language_loss": 0.7645421, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.78741288, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.4842894077301025 + }, + { + "auxiliary_loss_clip": 0.01168737, + "auxiliary_loss_mlp": 0.01117753, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.0006423, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.7710753822002678, + "language_loss": 0.64785999, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.67072487, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.5163111686706543 + }, + { + "auxiliary_loss_clip": 0.0111656, + "auxiliary_loss_mlp": 0.01100041, + "balance_loss_clip": 1.00162435, + "balance_loss_mlp": 1.00009561, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8485009198675266, + "language_loss": 0.65720367, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67936969, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.2272560596466064 + }, + { + "auxiliary_loss_clip": 0.01154096, + "auxiliary_loss_mlp": 0.00747915, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00108969, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.6266519152842291, + "language_loss": 0.68936628, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70838642, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.593386650085449 + }, + { + "auxiliary_loss_clip": 0.01136714, + "auxiliary_loss_mlp": 0.0111919, + "balance_loss_clip": 1.00198305, + "balance_loss_mlp": 1.00064802, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.701377053728181, + "language_loss": 0.74890667, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.77146566, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.6527156829833984 + }, + { + "auxiliary_loss_clip": 0.01136003, + "auxiliary_loss_mlp": 0.01120223, + "balance_loss_clip": 1.00204575, + "balance_loss_mlp": 1.00072801, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.5359358406810792, + "language_loss": 0.77377892, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79634118, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 2.638702869415283 + }, + { + "auxiliary_loss_clip": 0.01136848, + "auxiliary_loss_mlp": 0.01120022, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00081229, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 2.1225093403747266, + "language_loss": 0.78031647, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80288523, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.6448519229888916 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01119373, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00083113, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 2.2047272192389276, + "language_loss": 0.72613227, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74850965, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 2.598932981491089 + }, + { + "auxiliary_loss_clip": 0.01120579, + "auxiliary_loss_mlp": 0.01118565, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00088191, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 1.7176331030169243, + "language_loss": 0.81960833, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84199977, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.632230520248413 + }, + { + "auxiliary_loss_clip": 0.01152346, + "auxiliary_loss_mlp": 0.01119676, + "balance_loss_clip": 1.00208437, + "balance_loss_mlp": 1.0006578, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.8548144440314882, + "language_loss": 0.80486643, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82758665, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.5784976482391357 + }, + { + "auxiliary_loss_clip": 0.01135373, + "auxiliary_loss_mlp": 0.01119107, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00075579, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.5335628803582841, + "language_loss": 0.7214402, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74398506, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.640591621398926 + }, + { + "auxiliary_loss_clip": 0.01152524, + "auxiliary_loss_mlp": 0.01120411, + "balance_loss_clip": 1.00225306, + "balance_loss_mlp": 1.00072455, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 2.2712264064889487, + "language_loss": 0.68573004, + "learning_rate": 2.248031062546432e-06, + "loss": 0.70845938, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.5790610313415527 + }, + { + "auxiliary_loss_clip": 0.01120356, + "auxiliary_loss_mlp": 0.01119252, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00061536, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.407064409285451, + "language_loss": 0.68100625, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70340234, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.673774242401123 + }, + { + "auxiliary_loss_clip": 0.01169029, + "auxiliary_loss_mlp": 0.01119268, + "balance_loss_clip": 1.00216389, + "balance_loss_mlp": 1.00072622, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.0653414941456605, + "language_loss": 0.78413618, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80701911, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.4762558937072754 + }, + { + "auxiliary_loss_clip": 0.01138422, + "auxiliary_loss_mlp": 0.01118953, + "balance_loss_clip": 1.00205839, + "balance_loss_mlp": 1.00088775, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 2.9817227520008296, + "language_loss": 0.66318071, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68575442, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.8150229454040527 + }, + { + "auxiliary_loss_clip": 0.01155524, + "auxiliary_loss_mlp": 0.01118252, + "balance_loss_clip": 1.00237274, + "balance_loss_mlp": 1.00056875, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.9854652773245751, + "language_loss": 0.79602873, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.81876653, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.5970427989959717 + }, + { + "auxiliary_loss_clip": 0.01139258, + "auxiliary_loss_mlp": 0.01119548, + "balance_loss_clip": 1.00216305, + "balance_loss_mlp": 1.00072026, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.7668416631856396, + "language_loss": 0.75933003, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78191805, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.5980334281921387 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.00747617, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00105596, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.7114852023848597, + "language_loss": 0.79477918, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81362045, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 2.56278395652771 + }, + { + "auxiliary_loss_clip": 0.01153579, + "auxiliary_loss_mlp": 0.0112004, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00083125, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.859475942360966, + "language_loss": 0.73639524, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.75913143, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.5858259201049805 + }, + { + "auxiliary_loss_clip": 0.01152271, + "auxiliary_loss_mlp": 0.01119456, + "balance_loss_clip": 1.00210023, + "balance_loss_mlp": 1.00062859, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.9907416369892958, + "language_loss": 0.80099881, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82371604, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.5490217208862305 + }, + { + "auxiliary_loss_clip": 0.0112076, + "auxiliary_loss_mlp": 0.01119956, + "balance_loss_clip": 1.00192153, + "balance_loss_mlp": 1.00084174, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.5858073008388245, + "language_loss": 0.71208823, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.7344954, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.71570086479187 + }, + { + "auxiliary_loss_clip": 0.01168949, + "auxiliary_loss_mlp": 0.01120182, + "balance_loss_clip": 1.00205338, + "balance_loss_mlp": 1.00068641, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 1.9520909129705526, + "language_loss": 0.67929089, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.70218223, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 3.9707629680633545 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.01099933, + "balance_loss_clip": 1.00171995, + "balance_loss_mlp": 0.9999879, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7138809477157038, + "language_loss": 0.56342745, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58591181, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 3.3056912422180176 + }, + { + "auxiliary_loss_clip": 0.01136419, + "auxiliary_loss_mlp": 0.01118955, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00069904, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.557375180217258, + "language_loss": 0.88768458, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91023833, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.594817876815796 + }, + { + "auxiliary_loss_clip": 0.01153332, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_clip": 1.00195253, + "balance_loss_mlp": 1.00056005, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 1.7620541645128003, + "language_loss": 0.77309144, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.795811, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.5501012802124023 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01118471, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00059724, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.6744582059138367, + "language_loss": 0.84981322, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.8723532, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 4.089097261428833 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.0112004, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00083041, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 1.738505233672372, + "language_loss": 0.75889802, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78147256, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 2.5707108974456787 + }, + { + "auxiliary_loss_clip": 0.01152648, + "auxiliary_loss_mlp": 0.01118938, + "balance_loss_clip": 1.00198901, + "balance_loss_mlp": 1.00068283, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.6862091519951865, + "language_loss": 0.64990151, + "learning_rate": 2.241846586342682e-06, + "loss": 0.67261744, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 2.5428216457366943 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01119675, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.00065672, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.5381795400009215, + "language_loss": 0.73560214, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75803441, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.6155753135681152 + }, + { + "auxiliary_loss_clip": 0.01152409, + "auxiliary_loss_mlp": 0.01119408, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00067604, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.239520773609403, + "language_loss": 0.68182176, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.7045399, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.5180530548095703 + }, + { + "auxiliary_loss_clip": 0.01121535, + "auxiliary_loss_mlp": 0.00747787, + "balance_loss_clip": 1.00187206, + "balance_loss_mlp": 1.00092697, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.8319802356163934, + "language_loss": 0.75324118, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77193439, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.6850626468658447 + }, + { + "auxiliary_loss_clip": 0.01140379, + "auxiliary_loss_mlp": 0.0111911, + "balance_loss_clip": 1.00224209, + "balance_loss_mlp": 1.00085449, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7451308281994942, + "language_loss": 0.78888303, + "learning_rate": 2.240300098112506e-06, + "loss": 0.8114779, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 3.9906411170959473 + }, + { + "auxiliary_loss_clip": 0.01135615, + "auxiliary_loss_mlp": 0.01119339, + "balance_loss_clip": 1.00195968, + "balance_loss_mlp": 1.00070167, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.7430342626428592, + "language_loss": 0.736655, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75920451, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 3.96132493019104 + }, + { + "auxiliary_loss_clip": 0.01138668, + "auxiliary_loss_mlp": 0.01118927, + "balance_loss_clip": 1.0020746, + "balance_loss_mlp": 1.00048101, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.3885927658522235, + "language_loss": 0.78014022, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80271614, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.6013717651367188 + }, + { + "auxiliary_loss_clip": 0.0113693, + "auxiliary_loss_mlp": 0.01118529, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00055981, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 4.042205818842749, + "language_loss": 0.73612773, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.75868225, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.571850538253784 + }, + { + "auxiliary_loss_clip": 0.01137447, + "auxiliary_loss_mlp": 0.01118875, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00090528, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.552962975043269, + "language_loss": 0.74518931, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76775253, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.6810576915740967 + }, + { + "auxiliary_loss_clip": 0.0112167, + "auxiliary_loss_mlp": 0.01119153, + "balance_loss_clip": 1.0018971, + "balance_loss_mlp": 1.00061095, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 1.8402947623529657, + "language_loss": 0.79768229, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82009059, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.6751327514648438 + }, + { + "auxiliary_loss_clip": 0.01137239, + "auxiliary_loss_mlp": 0.01119862, + "balance_loss_clip": 1.00195634, + "balance_loss_mlp": 1.00074792, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.617249947743204, + "language_loss": 0.78221136, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80478239, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.5821361541748047 + }, + { + "auxiliary_loss_clip": 0.01152264, + "auxiliary_loss_mlp": 0.0111872, + "balance_loss_clip": 1.00203156, + "balance_loss_mlp": 1.00065541, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 3.20623249140586, + "language_loss": 0.84046721, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86317706, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 2.494159460067749 + }, + { + "auxiliary_loss_clip": 0.01137407, + "auxiliary_loss_mlp": 0.01118885, + "balance_loss_clip": 1.00215638, + "balance_loss_mlp": 1.00081992, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.4400269463886384, + "language_loss": 0.70488465, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72744751, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.5871071815490723 + }, + { + "auxiliary_loss_clip": 0.01138592, + "auxiliary_loss_mlp": 0.01118796, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.0006355, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.6062777957041556, + "language_loss": 0.81502712, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83760095, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.581264019012451 + }, + { + "auxiliary_loss_clip": 0.01135548, + "auxiliary_loss_mlp": 0.01118829, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.00066912, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 2.3254943861441144, + "language_loss": 0.84714836, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.86969215, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.593214750289917 + }, + { + "auxiliary_loss_clip": 0.011537, + "auxiliary_loss_mlp": 0.01118535, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00075662, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.508728540223599, + "language_loss": 0.79716736, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81988966, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.536501884460449 + }, + { + "auxiliary_loss_clip": 0.01123204, + "auxiliary_loss_mlp": 0.0074788, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00103128, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 1.8857548955286316, + "language_loss": 0.83101076, + "learning_rate": 2.235659762404047e-06, + "loss": 0.84972161, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.6601693630218506 + }, + { + "auxiliary_loss_clip": 0.01120244, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.00203335, + "balance_loss_mlp": 1.00055146, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.3913165561220056, + "language_loss": 0.72938925, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75177217, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.668823003768921 + }, + { + "auxiliary_loss_clip": 0.01118772, + "auxiliary_loss_mlp": 0.01118848, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00078273, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.8673517177288428, + "language_loss": 0.77017832, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79255462, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.641761064529419 + }, + { + "auxiliary_loss_clip": 0.01118763, + "auxiliary_loss_mlp": 0.01118309, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.0005306, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.6489845755792558, + "language_loss": 0.77632993, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.79870069, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.600761651992798 + }, + { + "auxiliary_loss_clip": 0.01137218, + "auxiliary_loss_mlp": 0.01118714, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00055361, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.7108268972729972, + "language_loss": 0.6425097, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.66506898, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.630438804626465 + }, + { + "auxiliary_loss_clip": 0.01153833, + "auxiliary_loss_mlp": 0.01118445, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00047588, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.9144715027701134, + "language_loss": 0.77433538, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79705811, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.754483222961426 + }, + { + "auxiliary_loss_clip": 0.01152182, + "auxiliary_loss_mlp": 0.01119467, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00054383, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.8742645506502074, + "language_loss": 0.76047647, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78319299, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.561814069747925 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00085831, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.5653282464822091, + "language_loss": 0.74880147, + "learning_rate": 2.232952304022137e-06, + "loss": 0.77107394, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.6491973400115967 + }, + { + "auxiliary_loss_clip": 0.01137303, + "auxiliary_loss_mlp": 0.01118744, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00048852, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.4918367792091933, + "language_loss": 0.7292738, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75183427, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.6227242946624756 + }, + { + "auxiliary_loss_clip": 0.0113874, + "auxiliary_loss_mlp": 0.01118552, + "balance_loss_clip": 1.00205696, + "balance_loss_mlp": 1.00048709, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 3.57882752881455, + "language_loss": 0.79379475, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81636763, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.6278398036956787 + }, + { + "auxiliary_loss_clip": 0.01118365, + "auxiliary_loss_mlp": 0.01099861, + "balance_loss_clip": 1.00162399, + "balance_loss_mlp": 0.99991554, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7602725348380787, + "language_loss": 0.62225163, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64443392, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.3378121852874756 + }, + { + "auxiliary_loss_clip": 0.01118127, + "auxiliary_loss_mlp": 0.01118549, + "balance_loss_clip": 1.00190961, + "balance_loss_mlp": 1.00077057, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.6176721222577248, + "language_loss": 0.77463704, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79700381, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.6536903381347656 + }, + { + "auxiliary_loss_clip": 0.0115201, + "auxiliary_loss_mlp": 0.01117772, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00047004, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 6.240737332091214, + "language_loss": 0.70426965, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72696745, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.587197780609131 + }, + { + "auxiliary_loss_clip": 0.01108302, + "auxiliary_loss_mlp": 0.01119038, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00049663, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.2662323510267681, + "language_loss": 0.79852581, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82079923, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.673628807067871 + }, + { + "auxiliary_loss_clip": 0.01152153, + "auxiliary_loss_mlp": 0.01118511, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.00054121, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.8708326768121055, + "language_loss": 0.70232725, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.72503394, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 2.5262491703033447 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01117913, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00070643, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.8197872717111372, + "language_loss": 0.78569603, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.808406, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 3.962028741836548 + }, + { + "auxiliary_loss_clip": 0.01133634, + "auxiliary_loss_mlp": 0.0110005, + "balance_loss_clip": 1.00177586, + "balance_loss_mlp": 1.00010514, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7469418715491192, + "language_loss": 0.54055941, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56289625, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.2139933109283447 + }, + { + "auxiliary_loss_clip": 0.01138237, + "auxiliary_loss_mlp": 0.01120233, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00064182, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 1.919156186320326, + "language_loss": 0.90296894, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92555362, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 2.5464529991149902 + }, + { + "auxiliary_loss_clip": 0.01169101, + "auxiliary_loss_mlp": 0.01120524, + "balance_loss_clip": 1.0022316, + "balance_loss_mlp": 1.00083804, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.810810581909757, + "language_loss": 0.73329031, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.7561866, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.5393712520599365 + }, + { + "auxiliary_loss_clip": 0.01154883, + "auxiliary_loss_mlp": 0.00747958, + "balance_loss_clip": 1.00219166, + "balance_loss_mlp": 1.0010016, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5042756510408795, + "language_loss": 0.78246093, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80148929, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.5736963748931885 + }, + { + "auxiliary_loss_clip": 0.01135482, + "auxiliary_loss_mlp": 0.01119574, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00065112, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.6605507201810148, + "language_loss": 0.89403397, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91658449, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 4.003434419631958 + }, + { + "auxiliary_loss_clip": 0.01154129, + "auxiliary_loss_mlp": 0.01119348, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00061536, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.5049463092453643, + "language_loss": 0.77056217, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79329693, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 2.6538491249084473 + }, + { + "auxiliary_loss_clip": 0.01122144, + "auxiliary_loss_mlp": 0.01120147, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00055599, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.624027950058037, + "language_loss": 0.71610928, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73853219, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 2.8181047439575195 + }, + { + "auxiliary_loss_clip": 0.0116898, + "auxiliary_loss_mlp": 0.01118782, + "balance_loss_clip": 1.00226355, + "balance_loss_mlp": 1.00071692, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.8550889861739026, + "language_loss": 0.7028982, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72577584, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 2.490246295928955 + }, + { + "auxiliary_loss_clip": 0.01136942, + "auxiliary_loss_mlp": 0.01117726, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00061524, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 1.8483878133592642, + "language_loss": 0.71245909, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73500586, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.632249116897583 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.0074618, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.00022376, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.7992058872923092, + "language_loss": 0.59474695, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61370957, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.0795722007751465 + }, + { + "auxiliary_loss_clip": 0.0110698, + "auxiliary_loss_mlp": 0.01118266, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00086915, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.698669379854365, + "language_loss": 0.66973215, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.69198465, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 4.043967247009277 + }, + { + "auxiliary_loss_clip": 0.0113919, + "auxiliary_loss_mlp": 0.01119332, + "balance_loss_clip": 1.00201499, + "balance_loss_mlp": 1.00079083, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 2.0269653639616028, + "language_loss": 0.70159578, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72418106, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 3.9816150665283203 + }, + { + "auxiliary_loss_clip": 0.01118634, + "auxiliary_loss_mlp": 0.01119306, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00076401, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.289420070435369, + "language_loss": 0.78949773, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.81187713, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 2.620882272720337 + }, + { + "auxiliary_loss_clip": 0.01105394, + "auxiliary_loss_mlp": 0.01118638, + "balance_loss_clip": 1.00196373, + "balance_loss_mlp": 1.00076365, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 2.125836396221228, + "language_loss": 0.7489922, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77123249, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.662858009338379 + }, + { + "auxiliary_loss_clip": 0.01120521, + "auxiliary_loss_mlp": 0.01118601, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00063121, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.8437113516354986, + "language_loss": 0.78719902, + "learning_rate": 2.224053348748365e-06, + "loss": 0.80959022, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.6255369186401367 + }, + { + "auxiliary_loss_clip": 0.01136737, + "auxiliary_loss_mlp": 0.0111913, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00077868, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 3.692283047958279, + "language_loss": 0.73436767, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75692636, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.7349491119384766 + }, + { + "auxiliary_loss_clip": 0.01150066, + "auxiliary_loss_mlp": 0.00746363, + "balance_loss_clip": 1.00168335, + "balance_loss_mlp": 1.00044465, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7596464860133015, + "language_loss": 0.59044147, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60940576, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.2245233058929443 + }, + { + "auxiliary_loss_clip": 0.01153905, + "auxiliary_loss_mlp": 0.00748017, + "balance_loss_clip": 1.00207436, + "balance_loss_mlp": 1.00111818, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 2.204890691594445, + "language_loss": 0.67180324, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69082242, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.6819825172424316 + }, + { + "auxiliary_loss_clip": 0.01137087, + "auxiliary_loss_mlp": 0.01118829, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00085914, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 2.3446923410048846, + "language_loss": 0.76143271, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78399181, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.622563362121582 + }, + { + "auxiliary_loss_clip": 0.01107683, + "auxiliary_loss_mlp": 0.01118725, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00065994, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.5219211403356019, + "language_loss": 0.78800637, + "learning_rate": 2.222118192362422e-06, + "loss": 0.81027043, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 2.7058284282684326 + }, + { + "auxiliary_loss_clip": 0.01138085, + "auxiliary_loss_mlp": 0.01118818, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.00075328, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.8843573329622088, + "language_loss": 0.79210514, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81467414, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.582822561264038 + }, + { + "auxiliary_loss_clip": 0.01106441, + "auxiliary_loss_mlp": 0.01119385, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00055766, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.3746740535697475, + "language_loss": 0.82775122, + "learning_rate": 2.2213440707461e-06, + "loss": 0.85000944, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.7143237590789795 + }, + { + "auxiliary_loss_clip": 0.01090062, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_clip": 1.0017525, + "balance_loss_mlp": 1.00076294, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.808059548931728, + "language_loss": 0.8056345, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82772434, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.676827907562256 + }, + { + "auxiliary_loss_clip": 0.01103827, + "auxiliary_loss_mlp": 0.01119309, + "balance_loss_clip": 1.00190794, + "balance_loss_mlp": 1.00067234, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.7114996880685907, + "language_loss": 0.72237957, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74461091, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.701925039291382 + }, + { + "auxiliary_loss_clip": 0.01168876, + "auxiliary_loss_mlp": 0.01118733, + "balance_loss_clip": 1.00213397, + "balance_loss_mlp": 1.00066853, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.9338049162274211, + "language_loss": 0.70652401, + "learning_rate": 2.220182825407892e-06, + "loss": 0.72940016, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.552366018295288 + }, + { + "auxiliary_loss_clip": 0.01153932, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_clip": 1.00206721, + "balance_loss_mlp": 1.00099111, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 1.4464854719378577, + "language_loss": 0.7156918, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73842645, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.6054396629333496 + }, + { + "auxiliary_loss_clip": 0.01152144, + "auxiliary_loss_mlp": 0.01119028, + "balance_loss_clip": 1.00208795, + "balance_loss_mlp": 1.00096262, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.5635789078858497, + "language_loss": 0.75125593, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77396768, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 2.710313081741333 + }, + { + "auxiliary_loss_clip": 0.01152343, + "auxiliary_loss_mlp": 0.01120318, + "balance_loss_clip": 1.00214875, + "balance_loss_mlp": 1.00091767, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 2.379759658540822, + "language_loss": 0.81679857, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83952522, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.543537139892578 + }, + { + "auxiliary_loss_clip": 0.0115329, + "auxiliary_loss_mlp": 0.0111934, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00060725, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.9182746939383413, + "language_loss": 0.71756482, + "learning_rate": 2.218634381467819e-06, + "loss": 0.74029112, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.6015853881835938 + }, + { + "auxiliary_loss_clip": 0.01153661, + "auxiliary_loss_mlp": 0.01117702, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00087655, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.5304292652078735, + "language_loss": 0.82639724, + "learning_rate": 2.218247249719507e-06, + "loss": 0.8491109, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.5632901191711426 + }, + { + "auxiliary_loss_clip": 0.01137501, + "auxiliary_loss_mlp": 0.01121077, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.00091434, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.3359593325773154, + "language_loss": 0.77696949, + "learning_rate": 2.217860109695239e-06, + "loss": 0.79955524, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.572338104248047 + }, + { + "auxiliary_loss_clip": 0.01152518, + "auxiliary_loss_mlp": 0.01119286, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00074434, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 4.053426143123585, + "language_loss": 0.70345926, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72617728, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.5619421005249023 + }, + { + "auxiliary_loss_clip": 0.01139157, + "auxiliary_loss_mlp": 0.0111933, + "balance_loss_clip": 1.00219357, + "balance_loss_mlp": 1.00088334, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.8848923393183077, + "language_loss": 0.70527965, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.7278645, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.6078526973724365 + }, + { + "auxiliary_loss_clip": 0.01169004, + "auxiliary_loss_mlp": 0.01119798, + "balance_loss_clip": 1.00213885, + "balance_loss_mlp": 1.00087523, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 1.9932321019113943, + "language_loss": 0.71493542, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73782343, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.5188097953796387 + }, + { + "auxiliary_loss_clip": 0.01121715, + "auxiliary_loss_mlp": 0.01120452, + "balance_loss_clip": 1.00208986, + "balance_loss_mlp": 1.00114727, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.8194743625249006, + "language_loss": 0.60979158, + "learning_rate": 2.216311467132199e-06, + "loss": 0.63221323, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.624591827392578 + }, + { + "auxiliary_loss_clip": 0.01135734, + "auxiliary_loss_mlp": 0.01099327, + "balance_loss_clip": 1.001876, + "balance_loss_mlp": 1.00014484, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8636833442334134, + "language_loss": 0.61357993, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63593054, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.1803717613220215 + }, + { + "auxiliary_loss_clip": 0.01152164, + "auxiliary_loss_mlp": 0.0111906, + "balance_loss_clip": 1.00214159, + "balance_loss_mlp": 1.0008992, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.661741825729512, + "language_loss": 0.73531389, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75802612, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.5535666942596436 + }, + { + "auxiliary_loss_clip": 0.01138526, + "auxiliary_loss_mlp": 0.01119223, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00077665, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.9673637003940325, + "language_loss": 0.79345822, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81603569, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 4.044472694396973 + }, + { + "auxiliary_loss_clip": 0.01118787, + "auxiliary_loss_mlp": 0.01119045, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00088429, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.64781053401755, + "language_loss": 0.73609221, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75847054, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 2.687788248062134 + }, + { + "auxiliary_loss_clip": 0.01136009, + "auxiliary_loss_mlp": 0.0111905, + "balance_loss_clip": 1.00215399, + "balance_loss_mlp": 1.00069904, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 2.1257867541539364, + "language_loss": 0.90918851, + "learning_rate": 2.214375479481094e-06, + "loss": 0.93173909, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.5899109840393066 + }, + { + "auxiliary_loss_clip": 0.0116898, + "auxiliary_loss_mlp": 0.01119031, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00096631, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 2.8906622124578685, + "language_loss": 0.74490994, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76779008, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.493539571762085 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01119727, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00070858, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.049263168288579, + "language_loss": 0.80865949, + "learning_rate": 2.213601027413894e-06, + "loss": 0.83122623, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 2.6147992610931396 + }, + { + "auxiliary_loss_clip": 0.01151913, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_clip": 1.00214803, + "balance_loss_mlp": 1.00084877, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 2.004896087539377, + "language_loss": 0.77275461, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79545236, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 3.95085072517395 + }, + { + "auxiliary_loss_clip": 0.01152136, + "auxiliary_loss_mlp": 0.01117723, + "balance_loss_clip": 1.00216603, + "balance_loss_mlp": 1.00061226, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 2.296193448339324, + "language_loss": 0.80413902, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82683766, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 2.5861802101135254 + }, + { + "auxiliary_loss_clip": 0.01121472, + "auxiliary_loss_mlp": 0.01119589, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.00057101, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.8121423389785143, + "language_loss": 0.76372707, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.7861377, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.668292760848999 + }, + { + "auxiliary_loss_clip": 0.01118705, + "auxiliary_loss_mlp": 0.01118524, + "balance_loss_clip": 1.0016036, + "balance_loss_mlp": 1.0008409, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.6176177518410866, + "language_loss": 0.79097021, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81334251, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.6358728408813477 + }, + { + "auxiliary_loss_clip": 0.0116872, + "auxiliary_loss_mlp": 0.01119151, + "balance_loss_clip": 1.00209022, + "balance_loss_mlp": 1.00070429, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 3.795953317584037, + "language_loss": 0.69528067, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71815944, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.4869842529296875 + }, + { + "auxiliary_loss_clip": 0.01137647, + "auxiliary_loss_mlp": 0.01120052, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00074792, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 2.483046948990416, + "language_loss": 0.62885523, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65143222, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.5949087142944336 + }, + { + "auxiliary_loss_clip": 0.01135794, + "auxiliary_loss_mlp": 0.00747914, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00112736, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.153081734158657, + "language_loss": 0.6633749, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68221194, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 3.9952023029327393 + }, + { + "auxiliary_loss_clip": 0.01075163, + "auxiliary_loss_mlp": 0.01118892, + "balance_loss_clip": 1.00170243, + "balance_loss_mlp": 1.00063658, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.7422934844462725, + "language_loss": 0.76483446, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78677499, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 4.171983242034912 + }, + { + "auxiliary_loss_clip": 0.01135769, + "auxiliary_loss_mlp": 0.01119073, + "balance_loss_clip": 1.00204778, + "balance_loss_mlp": 1.00062692, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.5416988622623506, + "language_loss": 0.75277305, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77532148, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 2.6065988540649414 + }, + { + "auxiliary_loss_clip": 0.01168947, + "auxiliary_loss_mlp": 0.01119058, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00070691, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.7657807599832407, + "language_loss": 0.71038908, + "learning_rate": 2.209728283441112e-06, + "loss": 0.7332691, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.525291919708252 + }, + { + "auxiliary_loss_clip": 0.01153952, + "auxiliary_loss_mlp": 0.0112009, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00078535, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 3.3431557365225144, + "language_loss": 0.74856067, + "learning_rate": 2.209340965060465e-06, + "loss": 0.77130109, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 2.555048942565918 + }, + { + "auxiliary_loss_clip": 0.01135289, + "auxiliary_loss_mlp": 0.0111978, + "balance_loss_clip": 1.00202596, + "balance_loss_mlp": 1.00076187, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.6502458844248915, + "language_loss": 0.67349684, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69604754, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.6087303161621094 + }, + { + "auxiliary_loss_clip": 0.01136699, + "auxiliary_loss_mlp": 0.01119185, + "balance_loss_clip": 1.00202, + "balance_loss_mlp": 1.00083423, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.423431734964477, + "language_loss": 0.73338735, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75594616, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.5597081184387207 + }, + { + "auxiliary_loss_clip": 0.01137569, + "auxiliary_loss_mlp": 0.01119278, + "balance_loss_clip": 1.00211895, + "balance_loss_mlp": 1.00073647, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 3.025448722946151, + "language_loss": 0.84729934, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.8698678, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.609508752822876 + }, + { + "auxiliary_loss_clip": 0.01137269, + "auxiliary_loss_mlp": 0.01119182, + "balance_loss_clip": 1.00206625, + "balance_loss_mlp": 1.00083101, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 2.0391204260770084, + "language_loss": 0.73370308, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.75626761, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.5888514518737793 + }, + { + "auxiliary_loss_clip": 0.01137475, + "auxiliary_loss_mlp": 0.01120773, + "balance_loss_clip": 1.00205362, + "balance_loss_mlp": 1.00099123, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.9114743388656952, + "language_loss": 0.71536851, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73795104, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.678635597229004 + }, + { + "auxiliary_loss_clip": 0.0115356, + "auxiliary_loss_mlp": 0.01119564, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.000736, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 2.10791343252329, + "language_loss": 0.7374711, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76020235, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.587254285812378 + }, + { + "auxiliary_loss_clip": 0.01087458, + "auxiliary_loss_mlp": 0.01120029, + "balance_loss_clip": 1.0016377, + "balance_loss_mlp": 1.00091505, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.589159978641337, + "language_loss": 0.83674955, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85882437, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.740203380584717 + }, + { + "auxiliary_loss_clip": 0.01120166, + "auxiliary_loss_mlp": 0.01118374, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00069046, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 2.054390002435856, + "language_loss": 0.79109395, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81347936, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.652202844619751 + }, + { + "auxiliary_loss_clip": 0.0113537, + "auxiliary_loss_mlp": 0.00748053, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00102115, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.7841471657619057, + "language_loss": 0.69661778, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71545202, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.74467134475708 + }, + { + "auxiliary_loss_clip": 0.0115209, + "auxiliary_loss_mlp": 0.0111967, + "balance_loss_clip": 1.00196159, + "balance_loss_mlp": 1.00074661, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.788762210650722, + "language_loss": 0.72756338, + "learning_rate": 2.205467347074847e-06, + "loss": 0.75028104, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.5347142219543457 + }, + { + "auxiliary_loss_clip": 0.01105398, + "auxiliary_loss_mlp": 0.01120761, + "balance_loss_clip": 1.00192559, + "balance_loss_mlp": 1.00050282, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.189746795409464, + "language_loss": 0.6882773, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71053886, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 2.693603277206421 + }, + { + "auxiliary_loss_clip": 0.0111815, + "auxiliary_loss_mlp": 0.01119166, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00062418, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.6380970054137842, + "language_loss": 0.79092413, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81329727, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 2.7566702365875244 + }, + { + "auxiliary_loss_clip": 0.01152088, + "auxiliary_loss_mlp": 0.01119153, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00061107, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.8680330493147876, + "language_loss": 0.77429235, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.7970047, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.5714945793151855 + }, + { + "auxiliary_loss_clip": 0.0115224, + "auxiliary_loss_mlp": 0.01119501, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.00067306, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.4348977500248468, + "language_loss": 0.75690687, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77962428, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.6661548614501953 + }, + { + "auxiliary_loss_clip": 0.01120224, + "auxiliary_loss_mlp": 0.01118877, + "balance_loss_clip": 1.0019809, + "balance_loss_mlp": 1.00071645, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.7064145001546145, + "language_loss": 0.66887927, + "learning_rate": 2.203530244988624e-06, + "loss": 0.69127023, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.675368070602417 + }, + { + "auxiliary_loss_clip": 0.01134147, + "auxiliary_loss_mlp": 0.01099331, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00014865, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.6860675338542721, + "language_loss": 0.58540106, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60773581, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.2533950805664062 + }, + { + "auxiliary_loss_clip": 0.01138783, + "auxiliary_loss_mlp": 0.01120233, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.00064254, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 2.2302812460202595, + "language_loss": 0.72083277, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74342293, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.559781074523926 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.0111943, + "balance_loss_clip": 1.00263262, + "balance_loss_mlp": 1.00060236, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.3050489834665415, + "language_loss": 0.75905222, + "learning_rate": 2.202367891004714e-06, + "loss": 0.781322, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.689258337020874 + }, + { + "auxiliary_loss_clip": 0.01101552, + "auxiliary_loss_mlp": 0.0111955, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.0006268, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 4.8419227232710815, + "language_loss": 0.6927709, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71498191, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.6660656929016113 + }, + { + "auxiliary_loss_clip": 0.01168892, + "auxiliary_loss_mlp": 0.01118521, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00074244, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 1.795356630414876, + "language_loss": 0.82476282, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84763694, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.538315534591675 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01118659, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.0005939, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.6241752082303675, + "language_loss": 0.80516493, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82772273, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.605969190597534 + }, + { + "auxiliary_loss_clip": 0.01152189, + "auxiliary_loss_mlp": 0.01119532, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00060844, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.7121574773763522, + "language_loss": 0.81052935, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83324659, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.5894179344177246 + }, + { + "auxiliary_loss_clip": 0.01139469, + "auxiliary_loss_mlp": 0.01118499, + "balance_loss_clip": 1.00253391, + "balance_loss_mlp": 1.00072002, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 4.570040604101012, + "language_loss": 0.72373587, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74631554, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 4.037865877151489 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.0074619, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00033557, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7009687795355746, + "language_loss": 0.56297672, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58194643, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 3.173137903213501 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01119826, + "balance_loss_clip": 1.00189781, + "balance_loss_mlp": 1.00061667, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.0836674104071653, + "language_loss": 0.75479978, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77719927, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.6640703678131104 + }, + { + "auxiliary_loss_clip": 0.01152204, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00055122, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 1.9339643363652124, + "language_loss": 0.65805089, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68076289, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.519503593444824 + }, + { + "auxiliary_loss_clip": 0.01152357, + "auxiliary_loss_mlp": 0.01118946, + "balance_loss_clip": 1.00206721, + "balance_loss_mlp": 1.00059474, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 2.610065629869602, + "language_loss": 0.69786996, + "learning_rate": 2.198880416254091e-06, + "loss": 0.72058296, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.634070873260498 + }, + { + "auxiliary_loss_clip": 0.01075774, + "auxiliary_loss_mlp": 0.01117225, + "balance_loss_clip": 1.00220978, + "balance_loss_mlp": 1.00059044, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.5914017103394287, + "language_loss": 0.69718033, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71911037, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 4.1551289558410645 + }, + { + "auxiliary_loss_clip": 0.0115213, + "auxiliary_loss_mlp": 0.01119095, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00064898, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.1881135350099594, + "language_loss": 0.63305545, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65576774, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 2.5319268703460693 + }, + { + "auxiliary_loss_clip": 0.01152246, + "auxiliary_loss_mlp": 0.01118845, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.000494, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.6659565239372967, + "language_loss": 0.67615426, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69886523, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 2.6166207790374756 + }, + { + "auxiliary_loss_clip": 0.01123576, + "auxiliary_loss_mlp": 0.01118219, + "balance_loss_clip": 1.00178921, + "balance_loss_mlp": 1.00053537, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.5258016435385346, + "language_loss": 0.8148042, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83722216, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 2.601353406906128 + }, + { + "auxiliary_loss_clip": 0.01153929, + "auxiliary_loss_mlp": 0.01120068, + "balance_loss_clip": 1.00216591, + "balance_loss_mlp": 1.00066829, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.5891884934904719, + "language_loss": 0.80145967, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82419956, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.572164535522461 + }, + { + "auxiliary_loss_clip": 0.01169107, + "auxiliary_loss_mlp": 0.01119779, + "balance_loss_clip": 1.00214899, + "balance_loss_mlp": 1.00066471, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.077948772281148, + "language_loss": 0.66967285, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69256163, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 3.9813344478607178 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.01119864, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00084543, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.7529201742383833, + "language_loss": 0.67204303, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69476652, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 3.944186210632324 + }, + { + "auxiliary_loss_clip": 0.01136731, + "auxiliary_loss_mlp": 0.01119413, + "balance_loss_clip": 1.00199378, + "balance_loss_mlp": 1.00068045, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 1.7662691525943293, + "language_loss": 0.82315052, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84571195, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.578117847442627 + }, + { + "auxiliary_loss_clip": 0.01089923, + "auxiliary_loss_mlp": 0.01118635, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.0006659, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.634911704321033, + "language_loss": 0.74406457, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76615012, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.7398111820220947 + }, + { + "auxiliary_loss_clip": 0.01137251, + "auxiliary_loss_mlp": 0.01119752, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.00063837, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.5673409647092864, + "language_loss": 0.78304732, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80561733, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 2.6391382217407227 + }, + { + "auxiliary_loss_clip": 0.01168855, + "auxiliary_loss_mlp": 0.00747815, + "balance_loss_clip": 1.00223434, + "balance_loss_mlp": 1.00107503, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.837173488350746, + "language_loss": 0.79131496, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81048167, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.5348105430603027 + }, + { + "auxiliary_loss_clip": 0.0115391, + "auxiliary_loss_mlp": 0.00747877, + "balance_loss_clip": 1.00211763, + "balance_loss_mlp": 1.00104213, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.5729986469274648, + "language_loss": 0.75741881, + "learning_rate": 2.194229501534644e-06, + "loss": 0.77643669, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.5504884719848633 + }, + { + "auxiliary_loss_clip": 0.01168787, + "auxiliary_loss_mlp": 0.01119106, + "balance_loss_clip": 1.00220799, + "balance_loss_mlp": 1.00056458, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.2772944962695933, + "language_loss": 0.71902543, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74190438, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.5516655445098877 + }, + { + "auxiliary_loss_clip": 0.01085465, + "auxiliary_loss_mlp": 0.01118666, + "balance_loss_clip": 1.00168514, + "balance_loss_mlp": 1.00069666, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.4632655884267884, + "language_loss": 0.79310614, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.8151474, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.6819581985473633 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01118115, + "balance_loss_clip": 1.00190854, + "balance_loss_mlp": 1.000718, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.3649691636582661, + "language_loss": 0.84330511, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86585402, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.5822668075561523 + }, + { + "auxiliary_loss_clip": 0.01119931, + "auxiliary_loss_mlp": 0.01118132, + "balance_loss_clip": 1.00198948, + "balance_loss_mlp": 1.00063968, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.7537406518523744, + "language_loss": 0.7791816, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80156225, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.7013912200927734 + }, + { + "auxiliary_loss_clip": 0.01088147, + "auxiliary_loss_mlp": 0.01118445, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.00038004, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.5757655983752685, + "language_loss": 0.77497768, + "learning_rate": 2.192291305922943e-06, + "loss": 0.79704356, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.685558557510376 + }, + { + "auxiliary_loss_clip": 0.01086714, + "auxiliary_loss_mlp": 0.01118089, + "balance_loss_clip": 1.0016048, + "balance_loss_mlp": 1.00059581, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.8083673619947054, + "language_loss": 0.71755469, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.73960268, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.7492926120758057 + }, + { + "auxiliary_loss_clip": 0.01105891, + "auxiliary_loss_mlp": 0.01118855, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00069427, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.349522944562866, + "language_loss": 0.88037795, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.90262538, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.657684087753296 + }, + { + "auxiliary_loss_clip": 0.01121932, + "auxiliary_loss_mlp": 0.011186, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00053537, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.7977053457345646, + "language_loss": 0.61113012, + "learning_rate": 2.19112830093786e-06, + "loss": 0.63353544, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 2.7009119987487793 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.00748084, + "balance_loss_clip": 1.00171685, + "balance_loss_mlp": 1.00112998, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.5846580581474572, + "language_loss": 0.73119593, + "learning_rate": 2.19074061809469e-06, + "loss": 0.74989927, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.6261723041534424 + }, + { + "auxiliary_loss_clip": 0.01168681, + "auxiliary_loss_mlp": 0.01117678, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00066245, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.698510012539727, + "language_loss": 0.81892246, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84178603, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 2.923335552215576 + }, + { + "auxiliary_loss_clip": 0.01137987, + "auxiliary_loss_mlp": 0.01118169, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00077152, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.8285780336899968, + "language_loss": 0.86627793, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88883948, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.5863406658172607 + }, + { + "auxiliary_loss_clip": 0.01119519, + "auxiliary_loss_mlp": 0.01098443, + "balance_loss_clip": 1.00176132, + "balance_loss_mlp": 1.00002372, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.906752194432125, + "language_loss": 0.58474046, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60692012, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.1859793663024902 + }, + { + "auxiliary_loss_clip": 0.01168975, + "auxiliary_loss_mlp": 0.01119661, + "balance_loss_clip": 1.00222468, + "balance_loss_mlp": 1.00073814, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 2.4988744513794607, + "language_loss": 0.72081029, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74369669, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.560467004776001 + }, + { + "auxiliary_loss_clip": 0.01118983, + "auxiliary_loss_mlp": 0.01119098, + "balance_loss_clip": 1.0018971, + "balance_loss_mlp": 1.00065148, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.804486759980183, + "language_loss": 0.79918152, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.82156235, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.616539239883423 + }, + { + "auxiliary_loss_clip": 0.01136984, + "auxiliary_loss_mlp": 0.01118245, + "balance_loss_clip": 1.00200093, + "balance_loss_mlp": 1.00056183, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 1.7573841159024655, + "language_loss": 0.84202826, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86458057, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.5782952308654785 + }, + { + "auxiliary_loss_clip": 0.01153766, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.00202334, + "balance_loss_mlp": 1.00063443, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.5681451940976237, + "language_loss": 0.8325299, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85525644, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.549299716949463 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.01117637, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.00052536, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.120054931813133, + "language_loss": 0.87126541, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89382333, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.5523366928100586 + }, + { + "auxiliary_loss_clip": 0.01101863, + "auxiliary_loss_mlp": 0.01117913, + "balance_loss_clip": 1.00185859, + "balance_loss_mlp": 1.00080144, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6066460849483342, + "language_loss": 0.80711877, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.8293165, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.639073133468628 + }, + { + "auxiliary_loss_clip": 0.01152167, + "auxiliary_loss_mlp": 0.01119322, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.0006845, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 1.7629452648873343, + "language_loss": 0.68300188, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70571673, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.571784019470215 + }, + { + "auxiliary_loss_clip": 0.01152143, + "auxiliary_loss_mlp": 0.01117963, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00085187, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.5195511679871125, + "language_loss": 0.77284884, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79554987, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.6203784942626953 + }, + { + "auxiliary_loss_clip": 0.01168772, + "auxiliary_loss_mlp": 0.01118438, + "balance_loss_clip": 1.00214469, + "balance_loss_mlp": 1.00065923, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.1693284500669305, + "language_loss": 0.69821787, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72108996, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.6153671741485596 + }, + { + "auxiliary_loss_clip": 0.01152033, + "auxiliary_loss_mlp": 0.01119759, + "balance_loss_clip": 1.00199997, + "balance_loss_mlp": 1.00074077, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 2.7132338474865034, + "language_loss": 0.72612321, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74884117, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 4.120286703109741 + }, + { + "auxiliary_loss_clip": 0.01136675, + "auxiliary_loss_mlp": 0.01118241, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00074852, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.5063375524097669, + "language_loss": 0.75248462, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77503371, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.584533214569092 + }, + { + "auxiliary_loss_clip": 0.01118722, + "auxiliary_loss_mlp": 0.01119065, + "balance_loss_clip": 1.00195396, + "balance_loss_mlp": 1.00071418, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.8496346539297657, + "language_loss": 0.84145045, + "learning_rate": 2.184924515731926e-06, + "loss": 0.8638283, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.6435279846191406 + }, + { + "auxiliary_loss_clip": 0.01168611, + "auxiliary_loss_mlp": 0.01118192, + "balance_loss_clip": 1.00208426, + "balance_loss_mlp": 1.00060415, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.556341880200926, + "language_loss": 0.76227427, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.7851423, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.514469623565674 + }, + { + "auxiliary_loss_clip": 0.0115384, + "auxiliary_loss_mlp": 0.01117808, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.0006969, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.6049653691827217, + "language_loss": 0.80775905, + "learning_rate": 2.184148915123631e-06, + "loss": 0.83047551, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.5766725540161133 + }, + { + "auxiliary_loss_clip": 0.01136834, + "auxiliary_loss_mlp": 0.00747996, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00109935, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4289002171645142, + "language_loss": 0.72006965, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73891789, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 4.066294431686401 + }, + { + "auxiliary_loss_clip": 0.0116861, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_clip": 1.00202024, + "balance_loss_mlp": 1.00064659, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.6371417120622862, + "language_loss": 0.68003333, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70290554, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.5323829650878906 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01119776, + "balance_loss_clip": 1.00222313, + "balance_loss_mlp": 1.0007571, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 1.8905840955534525, + "language_loss": 0.66134429, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68389797, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 2.568894147872925 + }, + { + "auxiliary_loss_clip": 0.01154199, + "auxiliary_loss_mlp": 0.01118562, + "balance_loss_clip": 1.00230432, + "balance_loss_mlp": 1.0006876, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.0942385984001373, + "language_loss": 0.78446847, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80719608, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.544557571411133 + }, + { + "auxiliary_loss_clip": 0.01138818, + "auxiliary_loss_mlp": 0.0111841, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.0007267, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 2.301313987495909, + "language_loss": 0.68117464, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.70374691, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 4.011326313018799 + }, + { + "auxiliary_loss_clip": 0.01138253, + "auxiliary_loss_mlp": 0.01118581, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 1.0008018, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.4345114430032584, + "language_loss": 0.71445656, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.7370249, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.5902605056762695 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.0111947, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00064254, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.7909152163545954, + "language_loss": 0.66138124, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68409806, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 4.158088445663452 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_clip": 1.00191915, + "balance_loss_mlp": 1.00073504, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.7457235305039582, + "language_loss": 0.66611797, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68834829, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.6748242378234863 + }, + { + "auxiliary_loss_clip": 0.01119714, + "auxiliary_loss_mlp": 0.01117998, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00060058, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3533574404503907, + "language_loss": 0.76661348, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78899062, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.733696222305298 + }, + { + "auxiliary_loss_clip": 0.01165011, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_clip": 1.00159311, + "balance_loss_mlp": 0.99995822, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6718632102107117, + "language_loss": 0.52349639, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54613024, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.220442056655884 + }, + { + "auxiliary_loss_clip": 0.01135844, + "auxiliary_loss_mlp": 0.01118307, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00062406, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 1.8810084095418471, + "language_loss": 0.73936695, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.76190841, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.5258872509002686 + }, + { + "auxiliary_loss_clip": 0.01152148, + "auxiliary_loss_mlp": 0.0111831, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00091267, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.5665661806463138, + "language_loss": 0.63110244, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65380704, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.5773520469665527 + }, + { + "auxiliary_loss_clip": 0.01168724, + "auxiliary_loss_mlp": 0.01118116, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.00052786, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.617600120184353, + "language_loss": 0.69025254, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71312094, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.5796985626220703 + }, + { + "auxiliary_loss_clip": 0.01119123, + "auxiliary_loss_mlp": 0.01117819, + "balance_loss_clip": 1.00240588, + "balance_loss_mlp": 1.00051713, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.6748429666267866, + "language_loss": 0.73766112, + "learning_rate": 2.178718935364259e-06, + "loss": 0.76003051, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.6077423095703125 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.00747984, + "balance_loss_clip": 1.00213373, + "balance_loss_mlp": 1.00113606, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.8518015370927756, + "language_loss": 0.77040631, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78923559, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.6351068019866943 + }, + { + "auxiliary_loss_clip": 0.01104833, + "auxiliary_loss_mlp": 0.01117914, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00051689, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 2.2265132855025525, + "language_loss": 0.75639665, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77862406, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.679710865020752 + }, + { + "auxiliary_loss_clip": 0.01151914, + "auxiliary_loss_mlp": 0.01117354, + "balance_loss_clip": 1.00209379, + "balance_loss_mlp": 1.00052857, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.6533571356900512, + "language_loss": 0.73860741, + "learning_rate": 2.177555194083212e-06, + "loss": 0.76130009, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.545652151107788 + }, + { + "auxiliary_loss_clip": 0.0115172, + "auxiliary_loss_mlp": 0.01117672, + "balance_loss_clip": 1.0019908, + "balance_loss_mlp": 1.00065589, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.6977875635096635, + "language_loss": 0.78608656, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80878043, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.5466742515563965 + }, + { + "auxiliary_loss_clip": 0.01151928, + "auxiliary_loss_mlp": 0.01118515, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.000736, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 2.171840278070875, + "language_loss": 0.72201312, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74471748, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.523151397705078 + }, + { + "auxiliary_loss_clip": 0.0115199, + "auxiliary_loss_mlp": 0.01118036, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00073385, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.710473806897698, + "language_loss": 0.76545954, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78815979, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.543088674545288 + }, + { + "auxiliary_loss_clip": 0.01151897, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_clip": 1.00202119, + "balance_loss_mlp": 1.00077033, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 2.105383699363078, + "language_loss": 0.75307262, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77578086, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.580612897872925 + }, + { + "auxiliary_loss_clip": 0.01133465, + "auxiliary_loss_mlp": 0.00746357, + "balance_loss_clip": 1.00147629, + "balance_loss_mlp": 1.00063944, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7826660699335118, + "language_loss": 0.48895669, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50775492, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 3.0523688793182373 + }, + { + "auxiliary_loss_clip": 0.01121821, + "auxiliary_loss_mlp": 0.01118619, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00064945, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.3721281147741415, + "language_loss": 0.76722974, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78963411, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.659466028213501 + }, + { + "auxiliary_loss_clip": 0.0113507, + "auxiliary_loss_mlp": 0.01118832, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00067186, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.15371330043642, + "language_loss": 0.71947062, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74200964, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.598788022994995 + }, + { + "auxiliary_loss_clip": 0.01121311, + "auxiliary_loss_mlp": 0.01117692, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00058067, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.5490699539448958, + "language_loss": 0.62955749, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65194756, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.70125675201416 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.0111863, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00066066, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.7316705573203959, + "language_loss": 0.79395497, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81650656, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.60591197013855 + }, + { + "auxiliary_loss_clip": 0.01134801, + "auxiliary_loss_mlp": 0.01118809, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00064909, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.5632990059167844, + "language_loss": 0.63207912, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65461522, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.5815227031707764 + }, + { + "auxiliary_loss_clip": 0.01087795, + "auxiliary_loss_mlp": 0.00747888, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00105786, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.6312536786335934, + "language_loss": 0.72412241, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74247926, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.736109972000122 + }, + { + "auxiliary_loss_clip": 0.01153305, + "auxiliary_loss_mlp": 0.01118534, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.00056458, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.7418628624026673, + "language_loss": 0.63751328, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.66023165, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.660296678543091 + }, + { + "auxiliary_loss_clip": 0.0115383, + "auxiliary_loss_mlp": 0.01119173, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.00063121, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.9775986802903285, + "language_loss": 0.82606554, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84879553, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.5578432083129883 + }, + { + "auxiliary_loss_clip": 0.01153804, + "auxiliary_loss_mlp": 0.0111931, + "balance_loss_clip": 1.00225818, + "balance_loss_mlp": 1.00067317, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 1.918087666041101, + "language_loss": 0.84972084, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87245202, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.523526668548584 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01119054, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00051248, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.549600957461657, + "language_loss": 0.85601622, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87840605, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.678572177886963 + }, + { + "auxiliary_loss_clip": 0.01137104, + "auxiliary_loss_mlp": 0.01118795, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.00073004, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 1.6946105028601575, + "language_loss": 0.78996915, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81252813, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.6008455753326416 + }, + { + "auxiliary_loss_clip": 0.01104339, + "auxiliary_loss_mlp": 0.01118612, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00064254, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 1.8024461929228657, + "language_loss": 0.72217965, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74440914, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 4.084757089614868 + }, + { + "auxiliary_loss_clip": 0.01153886, + "auxiliary_loss_mlp": 0.0111863, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.0006609, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6552085206473843, + "language_loss": 0.69078684, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.713512, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.616922616958618 + }, + { + "auxiliary_loss_clip": 0.01168751, + "auxiliary_loss_mlp": 0.01118679, + "balance_loss_clip": 1.00208807, + "balance_loss_mlp": 1.00051832, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 2.0208047242204743, + "language_loss": 0.76372802, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78660226, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.4841959476470947 + }, + { + "auxiliary_loss_clip": 0.0116862, + "auxiliary_loss_mlp": 0.01118125, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00072789, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.8352685906399324, + "language_loss": 0.75926816, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78213567, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.5268497467041016 + }, + { + "auxiliary_loss_clip": 0.01151854, + "auxiliary_loss_mlp": 0.01119086, + "balance_loss_clip": 1.00199735, + "balance_loss_mlp": 1.00064015, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.2152216864740764, + "language_loss": 0.6483472, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67105663, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 2.538811206817627 + }, + { + "auxiliary_loss_clip": 0.01121353, + "auxiliary_loss_mlp": 0.01118171, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00058293, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 2.0348815026393265, + "language_loss": 0.72476673, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74716198, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 4.059237480163574 + }, + { + "auxiliary_loss_clip": 0.01152186, + "auxiliary_loss_mlp": 0.0111875, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00078011, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.4672609048698697, + "language_loss": 0.69616115, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71887046, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 2.7034523487091064 + }, + { + "auxiliary_loss_clip": 0.01155064, + "auxiliary_loss_mlp": 0.01118409, + "balance_loss_clip": 1.00242376, + "balance_loss_mlp": 1.00053501, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.3543496286811751, + "language_loss": 0.70346844, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.7262032, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 2.6067473888397217 + }, + { + "auxiliary_loss_clip": 0.01107582, + "auxiliary_loss_mlp": 0.01117212, + "balance_loss_clip": 1.00177968, + "balance_loss_mlp": 1.00067294, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.5399656058075166, + "language_loss": 0.71171278, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.73396075, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 2.711305856704712 + }, + { + "auxiliary_loss_clip": 0.0110292, + "auxiliary_loss_mlp": 0.0111857, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00069547, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.397751389915585, + "language_loss": 0.80273241, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82494736, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 4.046516418457031 + }, + { + "auxiliary_loss_clip": 0.01168483, + "auxiliary_loss_mlp": 0.01117582, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.00066209, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.711890530960449, + "language_loss": 0.74355084, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76641154, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 4.060456275939941 + }, + { + "auxiliary_loss_clip": 0.01136698, + "auxiliary_loss_mlp": 0.01118443, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00056922, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 2.4785902377843927, + "language_loss": 0.73172581, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75427723, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.607600212097168 + }, + { + "auxiliary_loss_clip": 0.01069808, + "auxiliary_loss_mlp": 0.01118132, + "balance_loss_clip": 1.00167394, + "balance_loss_mlp": 1.00063896, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 1.8826843698202003, + "language_loss": 0.75071287, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77259231, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.6981537342071533 + }, + { + "auxiliary_loss_clip": 0.01120343, + "auxiliary_loss_mlp": 0.01117839, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00072813, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.49044816638202, + "language_loss": 0.74379796, + "learning_rate": 2.165914514023972e-06, + "loss": 0.7661798, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.628654956817627 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01119185, + "balance_loss_clip": 1.00207543, + "balance_loss_mlp": 1.00073862, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 2.4074006497216764, + "language_loss": 0.61932969, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64205748, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.5513744354248047 + }, + { + "auxiliary_loss_clip": 0.01118374, + "auxiliary_loss_mlp": 0.01119004, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00084352, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 1.8872315558284434, + "language_loss": 0.81596088, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.83833468, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.6081244945526123 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01118824, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00056815, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 2.370740118271874, + "language_loss": 0.72137284, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74377692, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.685246229171753 + }, + { + "auxiliary_loss_clip": 0.01168584, + "auxiliary_loss_mlp": 0.01117696, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.0005852, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.8328796863201136, + "language_loss": 0.6680575, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69092029, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.5750315189361572 + }, + { + "auxiliary_loss_clip": 0.01152828, + "auxiliary_loss_mlp": 0.00747862, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.0010637, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.4924613448446875, + "language_loss": 0.75467592, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77368283, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.677173376083374 + }, + { + "auxiliary_loss_clip": 0.01138177, + "auxiliary_loss_mlp": 0.01118258, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.00067019, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 2.2454939829062357, + "language_loss": 0.75965101, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78221536, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.592381000518799 + }, + { + "auxiliary_loss_clip": 0.01136825, + "auxiliary_loss_mlp": 0.00748088, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00116122, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 2.0551993830499438, + "language_loss": 0.8030684, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82191753, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.594465732574463 + }, + { + "auxiliary_loss_clip": 0.0115163, + "auxiliary_loss_mlp": 0.01117993, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00069118, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.4880207174845412, + "language_loss": 0.74510038, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76779658, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.550508975982666 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.01118328, + "balance_loss_clip": 1.00227368, + "balance_loss_mlp": 1.0006448, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 1.9865461381402587, + "language_loss": 0.82525939, + "learning_rate": 2.162421187770864e-06, + "loss": 0.84783828, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.569854497909546 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01118137, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.0006448, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.8006325899122304, + "language_loss": 0.74163979, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76405036, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.5944888591766357 + }, + { + "auxiliary_loss_clip": 0.01168864, + "auxiliary_loss_mlp": 0.01119201, + "balance_loss_clip": 1.0021863, + "balance_loss_mlp": 1.0005641, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.7620427687925042, + "language_loss": 0.7587598, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78164041, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.5456371307373047 + }, + { + "auxiliary_loss_clip": 0.01135433, + "auxiliary_loss_mlp": 0.01119149, + "balance_loss_clip": 1.00207579, + "balance_loss_mlp": 1.00089312, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 1.8326409232176308, + "language_loss": 0.72511649, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.74766231, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.588806629180908 + }, + { + "auxiliary_loss_clip": 0.01120042, + "auxiliary_loss_mlp": 0.01098449, + "balance_loss_clip": 1.00160956, + "balance_loss_mlp": 1.00002956, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8371233247756528, + "language_loss": 0.54337716, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56556213, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.18573260307312 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01118199, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00051582, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.5961784899946891, + "language_loss": 0.61165643, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63389909, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 2.8605570793151855 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01117959, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.00075269, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.6456776675154623, + "language_loss": 0.77016628, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79253745, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.6654162406921387 + }, + { + "auxiliary_loss_clip": 0.01133352, + "auxiliary_loss_mlp": 0.01098471, + "balance_loss_clip": 1.00160336, + "balance_loss_mlp": 1.00005198, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9909582154847544, + "language_loss": 0.67068064, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69299889, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.201498508453369 + }, + { + "auxiliary_loss_clip": 0.01168817, + "auxiliary_loss_mlp": 0.01118032, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00063491, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 2.05939245211157, + "language_loss": 0.76669228, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78956079, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.497037887573242 + }, + { + "auxiliary_loss_clip": 0.01152004, + "auxiliary_loss_mlp": 0.01118092, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00059986, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.718989937652943, + "language_loss": 0.83699757, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85969853, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.556352138519287 + }, + { + "auxiliary_loss_clip": 0.01153284, + "auxiliary_loss_mlp": 0.01117977, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00057983, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.7652402645225045, + "language_loss": 0.80043823, + "learning_rate": 2.158539129514956e-06, + "loss": 0.82315081, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 2.5180470943450928 + }, + { + "auxiliary_loss_clip": 0.01168916, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_clip": 1.00227904, + "balance_loss_mlp": 1.0007509, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.4352239588084343, + "language_loss": 0.69168419, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71455956, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.5457077026367188 + }, + { + "auxiliary_loss_clip": 0.01153414, + "auxiliary_loss_mlp": 0.0111818, + "balance_loss_clip": 1.00204933, + "balance_loss_mlp": 1.00059187, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 2.0579127006551765, + "language_loss": 0.73118138, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75389731, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.5514426231384277 + }, + { + "auxiliary_loss_clip": 0.01152257, + "auxiliary_loss_mlp": 0.01119045, + "balance_loss_clip": 1.0018543, + "balance_loss_mlp": 1.00078952, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 2.1087789579750504, + "language_loss": 0.71496248, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73767555, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.5220563411712646 + }, + { + "auxiliary_loss_clip": 0.0110511, + "auxiliary_loss_mlp": 0.01117945, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00064301, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.6270288798963586, + "language_loss": 0.68218732, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70441794, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.6955573558807373 + }, + { + "auxiliary_loss_clip": 0.01152271, + "auxiliary_loss_mlp": 0.01119253, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00052071, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 3.247715281001962, + "language_loss": 0.63224232, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65495753, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 3.9856300354003906 + }, + { + "auxiliary_loss_clip": 0.01120964, + "auxiliary_loss_mlp": 0.01118122, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00072479, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 4.07244369366406, + "language_loss": 0.76940274, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79179358, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 2.606652021408081 + }, + { + "auxiliary_loss_clip": 0.01153448, + "auxiliary_loss_mlp": 0.01119243, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00051033, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.517994680433156, + "language_loss": 0.76502532, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78775227, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.556940793991089 + }, + { + "auxiliary_loss_clip": 0.01136316, + "auxiliary_loss_mlp": 0.01118643, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00067377, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.9440167613411377, + "language_loss": 0.7747153, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79726487, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.585754156112671 + }, + { + "auxiliary_loss_clip": 0.01149637, + "auxiliary_loss_mlp": 0.01099248, + "balance_loss_clip": 1.00151038, + "balance_loss_mlp": 1.00006604, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7900539279879668, + "language_loss": 0.54163659, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56412542, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.1226646900177 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01118029, + "balance_loss_clip": 1.00178111, + "balance_loss_mlp": 1.00053596, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.0105368785985576, + "language_loss": 0.86090523, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.88310301, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.648097038269043 + }, + { + "auxiliary_loss_clip": 0.01153392, + "auxiliary_loss_mlp": 0.01117504, + "balance_loss_clip": 1.00219691, + "balance_loss_mlp": 1.00067878, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6689030213800446, + "language_loss": 0.73307991, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75578886, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 3.978210210800171 + }, + { + "auxiliary_loss_clip": 0.01151701, + "auxiliary_loss_mlp": 0.01118035, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00063741, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.6264567529715903, + "language_loss": 0.78362, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80631739, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.577324867248535 + }, + { + "auxiliary_loss_clip": 0.01139055, + "auxiliary_loss_mlp": 0.01117938, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00063646, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.181442619921343, + "language_loss": 0.7581917, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.7807616, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 2.582620143890381 + }, + { + "auxiliary_loss_clip": 0.01136969, + "auxiliary_loss_mlp": 0.01118825, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00085509, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 1.7105119445756445, + "language_loss": 0.81371534, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83627325, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 4.030738353729248 + }, + { + "auxiliary_loss_clip": 0.01148064, + "auxiliary_loss_mlp": 0.01098461, + "balance_loss_clip": 1.00144529, + "balance_loss_mlp": 1.00004232, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6892888783427982, + "language_loss": 0.53292787, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.5553931, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.1977760791778564 + }, + { + "auxiliary_loss_clip": 0.01153309, + "auxiliary_loss_mlp": 0.00748036, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.0012002, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 3.2860284574935523, + "language_loss": 0.62754571, + "learning_rate": 2.152326591972107e-06, + "loss": 0.64655918, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 3.9621286392211914 + }, + { + "auxiliary_loss_clip": 0.0112009, + "auxiliary_loss_mlp": 0.01118864, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.0007987, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.5670857002162268, + "language_loss": 0.69234347, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.714733, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.630237579345703 + }, + { + "auxiliary_loss_clip": 0.01153586, + "auxiliary_loss_mlp": 0.01118391, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.00061238, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.6233017250810375, + "language_loss": 0.74141037, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76413012, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.5718305110931396 + }, + { + "auxiliary_loss_clip": 0.01153798, + "auxiliary_loss_mlp": 0.01118715, + "balance_loss_clip": 1.00212979, + "balance_loss_mlp": 1.00064969, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.8572381432289222, + "language_loss": 0.69977987, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72250497, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.5659239292144775 + }, + { + "auxiliary_loss_clip": 0.01131356, + "auxiliary_loss_mlp": 0.00746317, + "balance_loss_clip": 1.00149846, + "balance_loss_mlp": 1.00074959, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.7055003001606288, + "language_loss": 0.46224469, + "learning_rate": 2.150773224180877e-06, + "loss": 0.4810214, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.1204679012298584 + }, + { + "auxiliary_loss_clip": 0.01168715, + "auxiliary_loss_mlp": 0.01118992, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.000736, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.8339574551849898, + "language_loss": 0.65421426, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.6770913, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.522329092025757 + }, + { + "auxiliary_loss_clip": 0.0105686, + "auxiliary_loss_mlp": 0.01118719, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00055838, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.889655466756328, + "language_loss": 0.69872862, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72048444, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.7444710731506348 + }, + { + "auxiliary_loss_clip": 0.01135217, + "auxiliary_loss_mlp": 0.01117955, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00065339, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.6272973455311501, + "language_loss": 0.8394686, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86200035, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.611424207687378 + }, + { + "auxiliary_loss_clip": 0.01168512, + "auxiliary_loss_mlp": 0.01117595, + "balance_loss_clip": 1.00219631, + "balance_loss_mlp": 1.00067496, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.03105340129738, + "language_loss": 0.73013622, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.75299728, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.5035128593444824 + }, + { + "auxiliary_loss_clip": 0.01118365, + "auxiliary_loss_mlp": 0.01117171, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00072753, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.150675164580398, + "language_loss": 0.7224077, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74476314, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.628086566925049 + }, + { + "auxiliary_loss_clip": 0.01088911, + "auxiliary_loss_mlp": 0.01119809, + "balance_loss_clip": 1.00194609, + "balance_loss_mlp": 1.00050437, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 1.6689713743747927, + "language_loss": 0.77258939, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79467666, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.7452194690704346 + }, + { + "auxiliary_loss_clip": 0.0113709, + "auxiliary_loss_mlp": 0.01117649, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00063372, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.5376728190983977, + "language_loss": 0.71004915, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73259658, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.5857553482055664 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.01118928, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00057745, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.6745112965592546, + "language_loss": 0.75110704, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77366823, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.6173911094665527 + }, + { + "auxiliary_loss_clip": 0.0115171, + "auxiliary_loss_mlp": 0.011187, + "balance_loss_clip": 1.00203741, + "balance_loss_mlp": 1.0006355, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.0856629501275847, + "language_loss": 0.67943311, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.70213723, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 2.572937250137329 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01118146, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00055778, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.4169760852385562, + "language_loss": 0.66896832, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69119835, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.6692845821380615 + }, + { + "auxiliary_loss_clip": 0.01151947, + "auxiliary_loss_mlp": 0.01117436, + "balance_loss_clip": 1.00213265, + "balance_loss_mlp": 1.00070667, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.7230368688028503, + "language_loss": 0.74594247, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76863629, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.598057985305786 + }, + { + "auxiliary_loss_clip": 0.01136868, + "auxiliary_loss_mlp": 0.01117287, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00065303, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5162742613612263, + "language_loss": 0.64637923, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66892081, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.7798142433166504 + }, + { + "auxiliary_loss_clip": 0.01168688, + "auxiliary_loss_mlp": 0.01118358, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.00067484, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.8972068141722778, + "language_loss": 0.71270567, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73557615, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.527109384536743 + }, + { + "auxiliary_loss_clip": 0.01168518, + "auxiliary_loss_mlp": 0.00748011, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00122249, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.7140094388068756, + "language_loss": 0.72125465, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.74041992, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.6711373329162598 + }, + { + "auxiliary_loss_clip": 0.01131432, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_clip": 1.00133038, + "balance_loss_mlp": 1.00001645, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7460486227711203, + "language_loss": 0.52193457, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54422563, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.2074902057647705 + }, + { + "auxiliary_loss_clip": 0.01168611, + "auxiliary_loss_mlp": 0.01118227, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00083005, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.3984510166299975, + "language_loss": 0.76964223, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79251063, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.538918972015381 + }, + { + "auxiliary_loss_clip": 0.01138066, + "auxiliary_loss_mlp": 0.01117446, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00052559, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.8941963549792569, + "language_loss": 0.69997978, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72253489, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.62593674659729 + }, + { + "auxiliary_loss_clip": 0.01103672, + "auxiliary_loss_mlp": 0.01117831, + "balance_loss_clip": 1.00187826, + "balance_loss_mlp": 1.00062394, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 2.0173103335855713, + "language_loss": 0.80828357, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83049858, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.7020630836486816 + }, + { + "auxiliary_loss_clip": 0.01122777, + "auxiliary_loss_mlp": 0.01118286, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00069761, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.9631336776591086, + "language_loss": 0.70702744, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72943807, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.664839506149292 + }, + { + "auxiliary_loss_clip": 0.01151719, + "auxiliary_loss_mlp": 0.01117165, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.00062597, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.9371415276548367, + "language_loss": 0.84372187, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86641073, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.5362424850463867 + }, + { + "auxiliary_loss_clip": 0.01153798, + "auxiliary_loss_mlp": 0.0111819, + "balance_loss_clip": 1.0021193, + "balance_loss_mlp": 1.00060153, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 2.4145046555458327, + "language_loss": 0.76088262, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78360248, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.509631633758545 + }, + { + "auxiliary_loss_clip": 0.01138114, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00064707, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.3959892546485353, + "language_loss": 0.59769994, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62027109, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.6185030937194824 + }, + { + "auxiliary_loss_clip": 0.01153353, + "auxiliary_loss_mlp": 0.01117562, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00073731, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.3986567318540513, + "language_loss": 0.79300106, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81571025, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 4.0157506465911865 + }, + { + "auxiliary_loss_clip": 0.01153827, + "auxiliary_loss_mlp": 0.01118842, + "balance_loss_clip": 1.00204837, + "balance_loss_mlp": 1.00049078, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.245030351515025, + "language_loss": 0.67473495, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69746161, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.5281615257263184 + }, + { + "auxiliary_loss_clip": 0.01134758, + "auxiliary_loss_mlp": 0.01117736, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00052965, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 1.7782454829046628, + "language_loss": 0.75690699, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77943194, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.622185468673706 + }, + { + "auxiliary_loss_clip": 0.01101937, + "auxiliary_loss_mlp": 0.01118234, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00055015, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.207511086220722, + "language_loss": 0.8027786, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82498038, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 2.6391701698303223 + }, + { + "auxiliary_loss_clip": 0.0115173, + "auxiliary_loss_mlp": 0.01117578, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00075269, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 4.871873853033689, + "language_loss": 0.66116488, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68385792, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.53383207321167 + }, + { + "auxiliary_loss_clip": 0.01168753, + "auxiliary_loss_mlp": 0.01119066, + "balance_loss_clip": 1.00211346, + "balance_loss_mlp": 1.00062025, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 2.0286768643318736, + "language_loss": 0.66322303, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68610126, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.550330638885498 + }, + { + "auxiliary_loss_clip": 0.01122029, + "auxiliary_loss_mlp": 0.01118035, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00054276, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.6589534434349957, + "language_loss": 0.76561052, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78801119, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 4.073687553405762 + }, + { + "auxiliary_loss_clip": 0.01136136, + "auxiliary_loss_mlp": 0.01118033, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00054002, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.8195585051792906, + "language_loss": 0.60896218, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.63150388, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 2.667903184890747 + }, + { + "auxiliary_loss_clip": 0.01136585, + "auxiliary_loss_mlp": 0.01118259, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00076652, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.9634195949521396, + "language_loss": 0.78314537, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80569386, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 2.6412549018859863 + }, + { + "auxiliary_loss_clip": 0.01138463, + "auxiliary_loss_mlp": 0.00747982, + "balance_loss_clip": 1.00192368, + "balance_loss_mlp": 1.00112259, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.8020055099374392, + "language_loss": 0.79131734, + "learning_rate": 2.138343067844089e-06, + "loss": 0.8101818, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 3.9846248626708984 + }, + { + "auxiliary_loss_clip": 0.01152229, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00062108, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 1.660888445924316, + "language_loss": 0.80988669, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83258724, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 2.6124491691589355 + }, + { + "auxiliary_loss_clip": 0.01122135, + "auxiliary_loss_mlp": 0.01119037, + "balance_loss_clip": 1.00212085, + "balance_loss_mlp": 1.00087655, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.2684390467566877, + "language_loss": 0.91230762, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93471932, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 4.06518030166626 + }, + { + "auxiliary_loss_clip": 0.01121953, + "auxiliary_loss_mlp": 0.01117921, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00071502, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.7261137473941146, + "language_loss": 0.64602882, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.66842753, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.6317081451416016 + }, + { + "auxiliary_loss_clip": 0.01103176, + "auxiliary_loss_mlp": 0.00747959, + "balance_loss_clip": 1.00178266, + "balance_loss_mlp": 1.00119615, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.8470040238518137, + "language_loss": 0.75653994, + "learning_rate": 2.136788910691711e-06, + "loss": 0.7750513, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.7637057304382324 + }, + { + "auxiliary_loss_clip": 0.01168649, + "auxiliary_loss_mlp": 0.01118401, + "balance_loss_clip": 1.00222528, + "balance_loss_mlp": 1.00071716, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.7570416072769424, + "language_loss": 0.84594285, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.8688134, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.5244863033294678 + }, + { + "auxiliary_loss_clip": 0.01151621, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00058377, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.4987358276390153, + "language_loss": 0.83244181, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85512638, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 2.65218448638916 + }, + { + "auxiliary_loss_clip": 0.01134784, + "auxiliary_loss_mlp": 0.01117706, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00059462, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.3628942176508068, + "language_loss": 0.74644965, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76897454, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.6080687046051025 + }, + { + "auxiliary_loss_clip": 0.01168568, + "auxiliary_loss_mlp": 0.00747745, + "balance_loss_clip": 1.00219584, + "balance_loss_mlp": 1.00114679, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 2.021283350187854, + "language_loss": 0.78029585, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.79945898, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.510215997695923 + }, + { + "auxiliary_loss_clip": 0.01101037, + "auxiliary_loss_mlp": 0.00747958, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00121999, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.1403362620618043, + "language_loss": 0.76650023, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78499019, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.6396427154541016 + }, + { + "auxiliary_loss_clip": 0.01134987, + "auxiliary_loss_mlp": 0.01117197, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00065804, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.81858168159693, + "language_loss": 0.6244185, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64694041, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.548654079437256 + }, + { + "auxiliary_loss_clip": 0.01168572, + "auxiliary_loss_mlp": 0.01117695, + "balance_loss_clip": 1.0021894, + "balance_loss_mlp": 1.00077462, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.7201200634899252, + "language_loss": 0.72354853, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.7464112, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.506453514099121 + }, + { + "auxiliary_loss_clip": 0.0112033, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.0006156, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 2.187446856852606, + "language_loss": 0.79345119, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81582987, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.602956533432007 + }, + { + "auxiliary_loss_clip": 0.01151897, + "auxiliary_loss_mlp": 0.01118006, + "balance_loss_clip": 1.00212574, + "balance_loss_mlp": 1.00070429, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.406582159996862, + "language_loss": 0.71774745, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74044657, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.5166103839874268 + }, + { + "auxiliary_loss_clip": 0.01151995, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_clip": 1.00214159, + "balance_loss_mlp": 1.00069475, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.606733449453253, + "language_loss": 0.74758607, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77028978, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 2.556771755218506 + }, + { + "auxiliary_loss_clip": 0.01137274, + "auxiliary_loss_mlp": 0.0111788, + "balance_loss_clip": 1.00233614, + "balance_loss_mlp": 1.00067341, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.3918482978673974, + "language_loss": 0.63363576, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65618736, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.633824586868286 + }, + { + "auxiliary_loss_clip": 0.01136055, + "auxiliary_loss_mlp": 0.01117475, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00065041, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 2.2285890010341194, + "language_loss": 0.76247263, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78500795, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.6028146743774414 + }, + { + "auxiliary_loss_clip": 0.01168488, + "auxiliary_loss_mlp": 0.01117274, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00054383, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.7573177774209388, + "language_loss": 0.71236122, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73521882, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.5482406616210938 + }, + { + "auxiliary_loss_clip": 0.01135151, + "auxiliary_loss_mlp": 0.01118376, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00059724, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.8469372943247102, + "language_loss": 0.71491516, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73745048, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.651336431503296 + }, + { + "auxiliary_loss_clip": 0.01168424, + "auxiliary_loss_mlp": 0.01116552, + "balance_loss_clip": 1.00216663, + "balance_loss_mlp": 1.00058532, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.4199509262917422, + "language_loss": 0.83786845, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86071819, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.5737814903259277 + }, + { + "auxiliary_loss_clip": 0.01153049, + "auxiliary_loss_mlp": 0.01117489, + "balance_loss_clip": 1.00202191, + "balance_loss_mlp": 1.00056911, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.084397339132162, + "language_loss": 0.74687672, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76958215, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.538726568222046 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.01117226, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00040042, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.5521921119323916, + "language_loss": 0.80038083, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.82307386, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.5402512550354004 + }, + { + "auxiliary_loss_clip": 0.01147636, + "auxiliary_loss_mlp": 0.01097637, + "balance_loss_clip": 1.00158441, + "balance_loss_mlp": 0.99998045, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7487661052715668, + "language_loss": 0.6017049, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62415761, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.231856346130371 + }, + { + "auxiliary_loss_clip": 0.01138776, + "auxiliary_loss_mlp": 0.01118154, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00066161, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 2.4313710984378343, + "language_loss": 0.69202983, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71459913, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.6398282051086426 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01117301, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00047636, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 2.2656822014121962, + "language_loss": 0.66546261, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68768787, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.755120277404785 + }, + { + "auxiliary_loss_clip": 0.01132668, + "auxiliary_loss_mlp": 0.01097847, + "balance_loss_clip": 1.00142956, + "balance_loss_mlp": 1.00019073, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8191621531209455, + "language_loss": 0.57991612, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60222125, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.0740294456481934 + }, + { + "auxiliary_loss_clip": 0.01123628, + "auxiliary_loss_mlp": 0.01117598, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00067782, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.602445559097639, + "language_loss": 0.7721495, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.79456186, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.6546833515167236 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01117159, + "balance_loss_clip": 1.00184083, + "balance_loss_mlp": 1.00061965, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.7925424317635101, + "language_loss": 0.7263782, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.748752, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.7326881885528564 + }, + { + "auxiliary_loss_clip": 0.01168406, + "auxiliary_loss_mlp": 0.01118037, + "balance_loss_clip": 1.00214112, + "balance_loss_mlp": 1.00063932, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.7790406073136835, + "language_loss": 0.75723755, + "learning_rate": 2.127462257935406e-06, + "loss": 0.78010201, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 3.945985794067383 + }, + { + "auxiliary_loss_clip": 0.0112013, + "auxiliary_loss_mlp": 0.01118406, + "balance_loss_clip": 1.00182068, + "balance_loss_mlp": 1.00072289, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.0958021603162287, + "language_loss": 0.74331236, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76569772, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.5985546112060547 + }, + { + "auxiliary_loss_clip": 0.0105343, + "auxiliary_loss_mlp": 0.01118214, + "balance_loss_clip": 1.0015986, + "balance_loss_mlp": 1.00062633, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.0150997760684266, + "language_loss": 0.79272509, + "learning_rate": 2.126684908394552e-06, + "loss": 0.8144415, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 2.799988031387329 + }, + { + "auxiliary_loss_clip": 0.01153541, + "auxiliary_loss_mlp": 0.01117064, + "balance_loss_clip": 1.00210273, + "balance_loss_mlp": 1.00090623, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.192665414662563, + "language_loss": 0.85632885, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87903494, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.5133566856384277 + }, + { + "auxiliary_loss_clip": 0.01103736, + "auxiliary_loss_mlp": 0.01116342, + "balance_loss_clip": 1.00184214, + "balance_loss_mlp": 1.00075626, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 2.277618162780691, + "language_loss": 0.77695155, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79915226, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.635220527648926 + }, + { + "auxiliary_loss_clip": 0.01134914, + "auxiliary_loss_mlp": 0.00747617, + "balance_loss_clip": 1.00189197, + "balance_loss_mlp": 1.00101089, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.5504340422018794, + "language_loss": 0.66988075, + "learning_rate": 2.125518848090833e-06, + "loss": 0.68870604, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.6416409015655518 + }, + { + "auxiliary_loss_clip": 0.01151841, + "auxiliary_loss_mlp": 0.01117335, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00079608, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 2.4263686231694304, + "language_loss": 0.68017095, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70286274, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 4.037514925003052 + }, + { + "auxiliary_loss_clip": 0.01139004, + "auxiliary_loss_mlp": 0.01117407, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00067759, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8539362039417848, + "language_loss": 0.75035095, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77291507, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.5713133811950684 + }, + { + "auxiliary_loss_clip": 0.01151488, + "auxiliary_loss_mlp": 0.01117323, + "balance_loss_clip": 1.00204539, + "balance_loss_mlp": 1.00068903, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 1.8405908623354237, + "language_loss": 0.81304181, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83572996, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.5228474140167236 + }, + { + "auxiliary_loss_clip": 0.01122229, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.0006249, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.5503298402675025, + "language_loss": 0.83693022, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85933459, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 4.061174154281616 + }, + { + "auxiliary_loss_clip": 0.01119776, + "auxiliary_loss_mlp": 0.01116936, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00049222, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.112430753621014, + "language_loss": 0.83889276, + "learning_rate": 2.123575319254087e-06, + "loss": 0.86125988, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 2.6500728130340576 + }, + { + "auxiliary_loss_clip": 0.0115194, + "auxiliary_loss_mlp": 0.01117353, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.0005275, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 1.767004900385977, + "language_loss": 0.7330929, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75578582, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 3.987553834915161 + }, + { + "auxiliary_loss_clip": 0.01136563, + "auxiliary_loss_mlp": 0.01118205, + "balance_loss_clip": 1.00215435, + "balance_loss_mlp": 1.00080752, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.6285064179580504, + "language_loss": 0.75725824, + "learning_rate": 2.122797874814289e-06, + "loss": 0.77980596, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.5753707885742188 + }, + { + "auxiliary_loss_clip": 0.01168524, + "auxiliary_loss_mlp": 0.01118437, + "balance_loss_clip": 1.00216746, + "balance_loss_mlp": 1.000754, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.5474541870803196, + "language_loss": 0.69837129, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72124094, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.5460755825042725 + }, + { + "auxiliary_loss_clip": 0.01105173, + "auxiliary_loss_mlp": 0.00747684, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00099349, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.9337666413982213, + "language_loss": 0.80192333, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82045192, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 2.6639938354492188 + }, + { + "auxiliary_loss_clip": 0.01168524, + "auxiliary_loss_mlp": 0.01116763, + "balance_loss_clip": 1.0021944, + "balance_loss_mlp": 1.00060594, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.634087101477089, + "language_loss": 0.81112087, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83397377, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.4746816158294678 + }, + { + "auxiliary_loss_clip": 0.01122282, + "auxiliary_loss_mlp": 0.01115902, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00050747, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.3188465100095172, + "language_loss": 0.67212772, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69450963, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.742844820022583 + }, + { + "auxiliary_loss_clip": 0.01120041, + "auxiliary_loss_mlp": 0.01118413, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.00063443, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.5953076576028873, + "language_loss": 0.73432177, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.75670636, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.6501142978668213 + }, + { + "auxiliary_loss_clip": 0.01136598, + "auxiliary_loss_mlp": 0.01116499, + "balance_loss_clip": 1.00200593, + "balance_loss_mlp": 1.00053179, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.7502826978445234, + "language_loss": 0.81647074, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83900166, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.540897846221924 + }, + { + "auxiliary_loss_clip": 0.01134942, + "auxiliary_loss_mlp": 0.01116412, + "balance_loss_clip": 1.00172448, + "balance_loss_mlp": 1.00044537, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.6814777250543147, + "language_loss": 0.80988842, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83240193, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.6125872135162354 + }, + { + "auxiliary_loss_clip": 0.01168582, + "auxiliary_loss_mlp": 0.01118601, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00072634, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 1.9414924323868288, + "language_loss": 0.66087073, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68374252, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.5128285884857178 + }, + { + "auxiliary_loss_clip": 0.01150972, + "auxiliary_loss_mlp": 0.01115429, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00051093, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.414215934681695, + "language_loss": 0.77664852, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79931253, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.5782670974731445 + }, + { + "auxiliary_loss_clip": 0.01136693, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_clip": 1.00210023, + "balance_loss_mlp": 1.00055194, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.693956922947794, + "language_loss": 0.78449821, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80704361, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.6189651489257812 + }, + { + "auxiliary_loss_clip": 0.01138517, + "auxiliary_loss_mlp": 0.01116935, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00058651, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 3.2754066168073708, + "language_loss": 0.7647934, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78734791, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.5562589168548584 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.01115578, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00075603, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.7103253300079426, + "language_loss": 0.89686835, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91905737, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 2.6935653686523438 + }, + { + "auxiliary_loss_clip": 0.01089412, + "auxiliary_loss_mlp": 0.0111672, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00065804, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.3776345489056967, + "language_loss": 0.73729253, + "learning_rate": 2.11774403721606e-06, + "loss": 0.75935388, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.7259604930877686 + }, + { + "auxiliary_loss_clip": 0.01103669, + "auxiliary_loss_mlp": 0.01118109, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.00080752, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 4.09787362518411, + "language_loss": 0.69308877, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71530658, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 2.6457972526550293 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.0111757, + "balance_loss_clip": 1.0017724, + "balance_loss_mlp": 1.00045872, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.3942204225822117, + "language_loss": 0.65277952, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67530638, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.606196880340576 + }, + { + "auxiliary_loss_clip": 0.01132997, + "auxiliary_loss_mlp": 0.010973, + "balance_loss_clip": 1.00154042, + "balance_loss_mlp": 1.00002563, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.886987301849866, + "language_loss": 0.53476483, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55706781, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.2154622077941895 + }, + { + "auxiliary_loss_clip": 0.01151696, + "auxiliary_loss_mlp": 0.01116421, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00054979, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.6588407685266646, + "language_loss": 0.79418564, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81686676, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.588782548904419 + }, + { + "auxiliary_loss_clip": 0.01135452, + "auxiliary_loss_mlp": 0.01116673, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00061095, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.794870654523173, + "language_loss": 0.74668634, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.7692076, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.650156021118164 + }, + { + "auxiliary_loss_clip": 0.01153355, + "auxiliary_loss_mlp": 0.0074788, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00108695, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.3811150970715456, + "language_loss": 0.67911416, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69812649, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 2.7675609588623047 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01116436, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.00075555, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.8614268427112715, + "language_loss": 0.85438323, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87690842, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.608085870742798 + }, + { + "auxiliary_loss_clip": 0.01104836, + "auxiliary_loss_mlp": 0.00747685, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00102627, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.5780099258855809, + "language_loss": 0.71160269, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73012793, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.693469524383545 + }, + { + "auxiliary_loss_clip": 0.01153833, + "auxiliary_loss_mlp": 0.01117001, + "balance_loss_clip": 1.00225186, + "balance_loss_mlp": 1.00055695, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.370152350046504, + "language_loss": 0.78398281, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80669117, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.580749034881592 + }, + { + "auxiliary_loss_clip": 0.01120405, + "auxiliary_loss_mlp": 0.01117648, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.00082254, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.281223665216013, + "language_loss": 0.66166478, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68404531, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.7596065998077393 + }, + { + "auxiliary_loss_clip": 0.01121837, + "auxiliary_loss_mlp": 0.01116192, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.00079763, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.646526095472255, + "language_loss": 0.78124207, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80362236, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.629814386367798 + }, + { + "auxiliary_loss_clip": 0.01120019, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.00048006, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.6385679062394167, + "language_loss": 0.75483298, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77720726, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.6967477798461914 + }, + { + "auxiliary_loss_clip": 0.01151668, + "auxiliary_loss_mlp": 0.01118638, + "balance_loss_clip": 1.00208831, + "balance_loss_mlp": 1.00066805, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.8078189073499475, + "language_loss": 0.83541477, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85811782, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.5739247798919678 + }, + { + "auxiliary_loss_clip": 0.01168219, + "auxiliary_loss_mlp": 0.00747729, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00099206, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3336008760362095, + "language_loss": 0.70047706, + "learning_rate": 2.112300599949172e-06, + "loss": 0.71963656, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 3.978389024734497 + }, + { + "auxiliary_loss_clip": 0.0115155, + "auxiliary_loss_mlp": 0.01116465, + "balance_loss_clip": 1.00210261, + "balance_loss_mlp": 1.00059342, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.8580813932589024, + "language_loss": 0.82359576, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84627593, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.5450589656829834 + }, + { + "auxiliary_loss_clip": 0.01151954, + "auxiliary_loss_mlp": 0.01117583, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00056684, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 1.904443434199705, + "language_loss": 0.6794157, + "learning_rate": 2.111522896975052e-06, + "loss": 0.70211107, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.5224263668060303 + }, + { + "auxiliary_loss_clip": 0.01153243, + "auxiliary_loss_mlp": 0.0111769, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00057876, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 1.9590718198246775, + "language_loss": 0.70952249, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73223174, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.5227913856506348 + }, + { + "auxiliary_loss_clip": 0.01119906, + "auxiliary_loss_mlp": 0.01117402, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00067258, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.6419558087112436, + "language_loss": 0.64718544, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66955853, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.682021379470825 + }, + { + "auxiliary_loss_clip": 0.01153486, + "auxiliary_loss_mlp": 0.01117928, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00062633, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.393341598576148, + "language_loss": 0.73315609, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75587022, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 3.9218411445617676 + }, + { + "auxiliary_loss_clip": 0.01135111, + "auxiliary_loss_mlp": 0.0111706, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00071192, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.4118409552285245, + "language_loss": 0.73334432, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75586605, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.6372768878936768 + }, + { + "auxiliary_loss_clip": 0.01104821, + "auxiliary_loss_mlp": 0.01116908, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.0006547, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.527307653166705, + "language_loss": 0.78642952, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.8086468, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 2.720944881439209 + }, + { + "auxiliary_loss_clip": 0.01135159, + "auxiliary_loss_mlp": 0.01117848, + "balance_loss_clip": 1.0017817, + "balance_loss_mlp": 1.00073647, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.643741692962619, + "language_loss": 0.73554105, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75807112, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 3.9838337898254395 + }, + { + "auxiliary_loss_clip": 0.01153512, + "auxiliary_loss_mlp": 0.01117254, + "balance_loss_clip": 1.00233102, + "balance_loss_mlp": 1.0007149, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.9097116897488906, + "language_loss": 0.74276626, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76547384, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 2.608795642852783 + }, + { + "auxiliary_loss_clip": 0.0113498, + "auxiliary_loss_mlp": 0.01117276, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00064206, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7195195519376665, + "language_loss": 0.85798764, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.88051021, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 4.098179340362549 + }, + { + "auxiliary_loss_clip": 0.01105134, + "auxiliary_loss_mlp": 0.01117553, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00044203, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 2.168662069087068, + "language_loss": 0.72339582, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74562263, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.750584602355957 + }, + { + "auxiliary_loss_clip": 0.01136567, + "auxiliary_loss_mlp": 0.01117699, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00058818, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.6936280171928404, + "language_loss": 0.80536985, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82791245, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.5822222232818604 + }, + { + "auxiliary_loss_clip": 0.01153355, + "auxiliary_loss_mlp": 0.01117403, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00057828, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.040801113772194, + "language_loss": 0.73400664, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75671422, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.5433926582336426 + }, + { + "auxiliary_loss_clip": 0.01152116, + "auxiliary_loss_mlp": 0.01118533, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.00075412, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.6281238439623396, + "language_loss": 0.84145892, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86416543, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.5746114253997803 + }, + { + "auxiliary_loss_clip": 0.01138558, + "auxiliary_loss_mlp": 0.01118328, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00083494, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.5141386556980914, + "language_loss": 0.66612172, + "learning_rate": 2.106467420591409e-06, + "loss": 0.68869054, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 2.5943543910980225 + }, + { + "auxiliary_loss_clip": 0.0116837, + "auxiliary_loss_mlp": 0.01118031, + "balance_loss_clip": 1.0021044, + "balance_loss_mlp": 1.00063336, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.652537332467637, + "language_loss": 0.67022598, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69309002, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.496572971343994 + }, + { + "auxiliary_loss_clip": 0.0115352, + "auxiliary_loss_mlp": 0.01117888, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00058651, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 3.280660662091084, + "language_loss": 0.82018977, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84290379, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.5665123462677 + }, + { + "auxiliary_loss_clip": 0.01153464, + "auxiliary_loss_mlp": 0.01117525, + "balance_loss_clip": 1.0021162, + "balance_loss_mlp": 1.00060487, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 49.12084407618969, + "language_loss": 0.72541821, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.74812812, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.560729742050171 + }, + { + "auxiliary_loss_clip": 0.01087749, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_clip": 1.00175321, + "balance_loss_mlp": 1.00054502, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.7568496987339999, + "language_loss": 0.67547971, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69752997, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.6891567707061768 + }, + { + "auxiliary_loss_clip": 0.01137705, + "auxiliary_loss_mlp": 0.0111686, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00051165, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.7901144953265273, + "language_loss": 0.64793158, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67047715, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.670124053955078 + }, + { + "auxiliary_loss_clip": 0.01102649, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00062776, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.8316068263566125, + "language_loss": 0.69663203, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71881974, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.664762258529663 + }, + { + "auxiliary_loss_clip": 0.01168329, + "auxiliary_loss_mlp": 0.01116867, + "balance_loss_clip": 1.00209033, + "balance_loss_mlp": 1.00061381, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 1.697697758680474, + "language_loss": 0.84768516, + "learning_rate": 2.103744956327814e-06, + "loss": 0.8705371, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.5094966888427734 + }, + { + "auxiliary_loss_clip": 0.01122184, + "auxiliary_loss_mlp": 0.01117772, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.0006609, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 4.6127137791852055, + "language_loss": 0.68967599, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71207559, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 2.658444881439209 + }, + { + "auxiliary_loss_clip": 0.01133345, + "auxiliary_loss_mlp": 0.01097308, + "balance_loss_clip": 1.0016017, + "balance_loss_mlp": 1.00003302, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7663607139661414, + "language_loss": 0.51100373, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53331017, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.254417657852173 + }, + { + "auxiliary_loss_clip": 0.01136019, + "auxiliary_loss_mlp": 0.0111736, + "balance_loss_clip": 1.00199664, + "balance_loss_mlp": 1.00082088, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.6863485245197607, + "language_loss": 0.84490472, + "learning_rate": 2.102578126623879e-06, + "loss": 0.8674385, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 2.57136607170105 + }, + { + "auxiliary_loss_clip": 0.01151509, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_clip": 1.00212884, + "balance_loss_mlp": 1.00063634, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 2.408311064930984, + "language_loss": 0.69209433, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71477449, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.5365843772888184 + }, + { + "auxiliary_loss_clip": 0.01168368, + "auxiliary_loss_mlp": 0.01117782, + "balance_loss_clip": 1.00201857, + "balance_loss_mlp": 1.00048018, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.6453360551718261, + "language_loss": 0.72627467, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74913615, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.5934324264526367 + }, + { + "auxiliary_loss_clip": 0.01151735, + "auxiliary_loss_mlp": 0.01117243, + "balance_loss_clip": 1.00209546, + "balance_loss_mlp": 1.00060844, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 5.323405026125673, + "language_loss": 0.80809796, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83078778, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.598564863204956 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01097253, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 0.99997807, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7076006205660311, + "language_loss": 0.56854039, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.59069186, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.3169872760772705 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01117203, + "balance_loss_clip": 1.00231767, + "balance_loss_mlp": 1.00056911, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.7727402245080952, + "language_loss": 0.82597142, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.8488282, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.4916927814483643 + }, + { + "auxiliary_loss_clip": 0.01168351, + "auxiliary_loss_mlp": 0.01117424, + "balance_loss_clip": 1.00212455, + "balance_loss_mlp": 1.00040781, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.8651758395021745, + "language_loss": 0.6116609, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63451862, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.564033031463623 + }, + { + "auxiliary_loss_clip": 0.01168231, + "auxiliary_loss_mlp": 0.01116518, + "balance_loss_clip": 1.00207317, + "balance_loss_mlp": 1.00064707, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5464900740750553, + "language_loss": 0.74985933, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.77270681, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.5441064834594727 + }, + { + "auxiliary_loss_clip": 0.01136342, + "auxiliary_loss_mlp": 0.01117389, + "balance_loss_clip": 1.00200343, + "balance_loss_mlp": 1.00065947, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 1.8281323942102696, + "language_loss": 0.79714841, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.81968576, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.5895557403564453 + }, + { + "auxiliary_loss_clip": 0.01153529, + "auxiliary_loss_mlp": 0.01117613, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00069284, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 3.0724870106190534, + "language_loss": 0.70832884, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73104024, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.5117204189300537 + }, + { + "auxiliary_loss_clip": 0.01135137, + "auxiliary_loss_mlp": 0.01117332, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00060248, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.7966885748816488, + "language_loss": 0.77493882, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79746354, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.559652090072632 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01117021, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00067306, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.8446255939660514, + "language_loss": 0.84751999, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86990434, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.6637790203094482 + }, + { + "auxiliary_loss_clip": 0.01136498, + "auxiliary_loss_mlp": 0.01117012, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00056887, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 2.0854491907528048, + "language_loss": 0.80901575, + "learning_rate": 2.097910461710939e-06, + "loss": 0.8315509, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 3.992582082748413 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.00747804, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.00102651, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 1.7812082798556022, + "language_loss": 0.79373294, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81240433, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 2.655578374862671 + }, + { + "auxiliary_loss_clip": 0.01168332, + "auxiliary_loss_mlp": 0.0111656, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00068867, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.5032527782702945, + "language_loss": 0.74519563, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76804453, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.74857497215271 + }, + { + "auxiliary_loss_clip": 0.01151871, + "auxiliary_loss_mlp": 0.01117192, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00055778, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.5238318238375974, + "language_loss": 0.81019235, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.832883, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 2.6006274223327637 + }, + { + "auxiliary_loss_clip": 0.0113438, + "auxiliary_loss_mlp": 0.0111734, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00060987, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.8342065564548868, + "language_loss": 0.83245426, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85497147, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.575890064239502 + }, + { + "auxiliary_loss_clip": 0.01151511, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_clip": 1.00203562, + "balance_loss_mlp": 1.00058794, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.710057698517641, + "language_loss": 0.82025492, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.84292793, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.539379596710205 + }, + { + "auxiliary_loss_clip": 0.01121712, + "auxiliary_loss_mlp": 0.01116106, + "balance_loss_clip": 1.00198972, + "balance_loss_mlp": 1.0006156, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.5535027969172277, + "language_loss": 0.7138952, + "learning_rate": 2.095576427171635e-06, + "loss": 0.73627341, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 4.109277725219727 + }, + { + "auxiliary_loss_clip": 0.01120403, + "auxiliary_loss_mlp": 0.01119443, + "balance_loss_clip": 1.0021019, + "balance_loss_mlp": 1.00090122, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 2.380272334667337, + "language_loss": 0.76867872, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79107714, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.5857369899749756 + }, + { + "auxiliary_loss_clip": 0.01151389, + "auxiliary_loss_mlp": 0.0074777, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00102377, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.6965674630171057, + "language_loss": 0.82787871, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.8468703, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.535710573196411 + }, + { + "auxiliary_loss_clip": 0.01153625, + "auxiliary_loss_mlp": 0.01117704, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00068772, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.178389479472288, + "language_loss": 0.73241776, + "learning_rate": 2.094409360775228e-06, + "loss": 0.75513107, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 3.9012033939361572 + }, + { + "auxiliary_loss_clip": 0.01118189, + "auxiliary_loss_mlp": 0.0111725, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00061524, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.9960419241739014, + "language_loss": 0.69435525, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71670961, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.701122760772705 + }, + { + "auxiliary_loss_clip": 0.01151776, + "auxiliary_loss_mlp": 0.00747891, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00113332, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 1.9369340280810228, + "language_loss": 0.72207391, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.74107057, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 3.989647388458252 + }, + { + "auxiliary_loss_clip": 0.01121705, + "auxiliary_loss_mlp": 0.01116975, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00072193, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.6495317108178515, + "language_loss": 0.73646581, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7588526, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.663695812225342 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.01116219, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00082457, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5531324203505446, + "language_loss": 0.77757925, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80010867, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.589416265487671 + }, + { + "auxiliary_loss_clip": 0.01168433, + "auxiliary_loss_mlp": 0.01117235, + "balance_loss_clip": 1.00220263, + "balance_loss_mlp": 1.00069571, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.3128108780437113, + "language_loss": 0.87804854, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90090513, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.493457078933716 + }, + { + "auxiliary_loss_clip": 0.01119741, + "auxiliary_loss_mlp": 0.01117313, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00058293, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.118713291810029, + "language_loss": 0.74232566, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76469624, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.633479595184326 + }, + { + "auxiliary_loss_clip": 0.01168355, + "auxiliary_loss_mlp": 0.01116996, + "balance_loss_clip": 1.00217426, + "balance_loss_mlp": 1.00074291, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 1.5611698880136724, + "language_loss": 0.79666793, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81952143, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.6846165657043457 + }, + { + "auxiliary_loss_clip": 0.01131048, + "auxiliary_loss_mlp": 0.00746068, + "balance_loss_clip": 1.00142241, + "balance_loss_mlp": 1.00049615, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7297254987211399, + "language_loss": 0.56096911, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.57974029, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 3.0224056243896484 + }, + { + "auxiliary_loss_clip": 0.01151655, + "auxiliary_loss_mlp": 0.01116777, + "balance_loss_clip": 1.00218093, + "balance_loss_mlp": 1.00061953, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 1.8630504622050352, + "language_loss": 0.65417302, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67685729, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.6305646896362305 + }, + { + "auxiliary_loss_clip": 0.01168139, + "auxiliary_loss_mlp": 0.01116235, + "balance_loss_clip": 1.00208259, + "balance_loss_mlp": 1.00064993, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.6526687803930729, + "language_loss": 0.74863553, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77147925, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.5692436695098877 + }, + { + "auxiliary_loss_clip": 0.01168491, + "auxiliary_loss_mlp": 0.01117974, + "balance_loss_clip": 1.0021646, + "balance_loss_mlp": 1.00076747, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 1.7860483980683555, + "language_loss": 0.80432075, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82718539, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.5272202491760254 + }, + { + "auxiliary_loss_clip": 0.0114798, + "auxiliary_loss_mlp": 0.01097072, + "balance_loss_clip": 1.00150609, + "balance_loss_mlp": 1.00017917, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8906395456703846, + "language_loss": 0.62636662, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64881712, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.088292121887207 + }, + { + "auxiliary_loss_clip": 0.01152785, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00062609, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.5288222208205091, + "language_loss": 0.79905289, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.82174379, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.596958875656128 + }, + { + "auxiliary_loss_clip": 0.01121505, + "auxiliary_loss_mlp": 0.01116382, + "balance_loss_clip": 1.00201321, + "balance_loss_mlp": 1.00070119, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.7344224017656469, + "language_loss": 0.80282629, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82520515, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.6221840381622314 + }, + { + "auxiliary_loss_clip": 0.01168443, + "auxiliary_loss_mlp": 0.01117115, + "balance_loss_clip": 1.00208807, + "balance_loss_mlp": 1.00067186, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.8164041287975374, + "language_loss": 0.79165077, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81450629, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.5076382160186768 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01117495, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00057471, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 5.935777057568543, + "language_loss": 0.85059392, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87311733, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 2.664935827255249 + }, + { + "auxiliary_loss_clip": 0.01151519, + "auxiliary_loss_mlp": 0.01116038, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00054801, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.363726357891071, + "language_loss": 0.7070601, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72973567, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.5917160511016846 + }, + { + "auxiliary_loss_clip": 0.01121237, + "auxiliary_loss_mlp": 0.01118458, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.00058377, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.966507020505563, + "language_loss": 0.7825107, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80490762, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 2.7058827877044678 + }, + { + "auxiliary_loss_clip": 0.01119293, + "auxiliary_loss_mlp": 0.01117245, + "balance_loss_clip": 1.00199473, + "balance_loss_mlp": 1.00061035, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 3.0439537649062194, + "language_loss": 0.89313877, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.9155041, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.5760796070098877 + }, + { + "auxiliary_loss_clip": 0.01135131, + "auxiliary_loss_mlp": 0.01116946, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00078845, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.7682144118665397, + "language_loss": 0.76758122, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.79010201, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.6148507595062256 + }, + { + "auxiliary_loss_clip": 0.01152916, + "auxiliary_loss_mlp": 0.01115787, + "balance_loss_clip": 1.00216103, + "balance_loss_mlp": 1.00048757, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 3.457217337641737, + "language_loss": 0.67431289, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69699991, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.556668758392334 + }, + { + "auxiliary_loss_clip": 0.01136723, + "auxiliary_loss_mlp": 0.01116866, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00070882, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.7925174585475592, + "language_loss": 0.7534374, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77597326, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.6191020011901855 + }, + { + "auxiliary_loss_clip": 0.0115187, + "auxiliary_loss_mlp": 0.01117505, + "balance_loss_clip": 1.0020963, + "balance_loss_mlp": 1.00048947, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 1.9841014282617069, + "language_loss": 0.78410536, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80679911, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.535247564315796 + }, + { + "auxiliary_loss_clip": 0.01136192, + "auxiliary_loss_mlp": 0.00747785, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00083685, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 2.160361845143395, + "language_loss": 0.688568, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.70740783, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 2.581035614013672 + }, + { + "auxiliary_loss_clip": 0.01121587, + "auxiliary_loss_mlp": 0.01117074, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00082111, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 3.02895687539309, + "language_loss": 0.71369898, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73608559, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.620727300643921 + }, + { + "auxiliary_loss_clip": 0.01151414, + "auxiliary_loss_mlp": 0.01116133, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00054765, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.417349805013063, + "language_loss": 0.74132657, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76400203, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.5633718967437744 + }, + { + "auxiliary_loss_clip": 0.01151614, + "auxiliary_loss_mlp": 0.01116245, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00056446, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 1.9342346203402345, + "language_loss": 0.64172924, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.66440785, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.5051355361938477 + }, + { + "auxiliary_loss_clip": 0.01116257, + "auxiliary_loss_mlp": 0.01096944, + "balance_loss_clip": 1.00143504, + "balance_loss_mlp": 1.00005102, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.7823213961163384, + "language_loss": 0.59817535, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.62030739, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.3210082054138184 + }, + { + "auxiliary_loss_clip": 0.01138333, + "auxiliary_loss_mlp": 0.01116934, + "balance_loss_clip": 1.00218093, + "balance_loss_mlp": 1.00077617, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 2.0960712783226025, + "language_loss": 0.75207287, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77462554, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 4.055853366851807 + }, + { + "auxiliary_loss_clip": 0.01138767, + "auxiliary_loss_mlp": 0.01117192, + "balance_loss_clip": 1.00239682, + "balance_loss_mlp": 1.00055742, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6395037022313304, + "language_loss": 0.72277772, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74533731, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.610144853591919 + }, + { + "auxiliary_loss_clip": 0.01153666, + "auxiliary_loss_mlp": 0.01118207, + "balance_loss_clip": 1.00224805, + "balance_loss_mlp": 1.00071406, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.594851335103479, + "language_loss": 0.73604625, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75876504, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.582280158996582 + }, + { + "auxiliary_loss_clip": 0.01136714, + "auxiliary_loss_mlp": 0.01116958, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00070453, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.4702168913423599, + "language_loss": 0.72226727, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74480397, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 2.653062105178833 + }, + { + "auxiliary_loss_clip": 0.01151818, + "auxiliary_loss_mlp": 0.01117755, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00073957, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.6125290140034056, + "language_loss": 0.81534803, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83804375, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 2.581190347671509 + }, + { + "auxiliary_loss_clip": 0.01151935, + "auxiliary_loss_mlp": 0.0111854, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00076175, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 1.9653577965534423, + "language_loss": 0.76359046, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78629524, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 2.532984972000122 + }, + { + "auxiliary_loss_clip": 0.01151559, + "auxiliary_loss_mlp": 0.011168, + "balance_loss_clip": 1.00201523, + "balance_loss_mlp": 1.00054729, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.6352447201185252, + "language_loss": 0.75884771, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78153127, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 3.9541358947753906 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01117453, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.0009141, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.8549029002160595, + "language_loss": 0.72683728, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74937725, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 2.6196751594543457 + }, + { + "auxiliary_loss_clip": 0.01136942, + "auxiliary_loss_mlp": 0.01116491, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00071466, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.7367752405007135, + "language_loss": 0.77064699, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79318136, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 3.9597136974334717 + }, + { + "auxiliary_loss_clip": 0.01120495, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_clip": 1.0022006, + "balance_loss_mlp": 1.00077689, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.499237840431912, + "language_loss": 0.76548034, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78785747, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.656984806060791 + }, + { + "auxiliary_loss_clip": 0.01121131, + "auxiliary_loss_mlp": 0.01118649, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00058377, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.8166969255005003, + "language_loss": 0.85191375, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87431157, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.6965460777282715 + }, + { + "auxiliary_loss_clip": 0.01134666, + "auxiliary_loss_mlp": 0.01117689, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00057817, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 2.0129873489323766, + "language_loss": 0.78583312, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80835664, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.6525299549102783 + }, + { + "auxiliary_loss_clip": 0.01151381, + "auxiliary_loss_mlp": 0.01116485, + "balance_loss_clip": 1.00194824, + "balance_loss_mlp": 1.00051796, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.3655332386426786, + "language_loss": 0.75244892, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77512753, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 3.959580898284912 + }, + { + "auxiliary_loss_clip": 0.01168234, + "auxiliary_loss_mlp": 0.01116576, + "balance_loss_clip": 1.00210321, + "balance_loss_mlp": 1.0006094, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5614052424845875, + "language_loss": 0.69637227, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.7192204, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.503600597381592 + }, + { + "auxiliary_loss_clip": 0.01134497, + "auxiliary_loss_mlp": 0.0111805, + "balance_loss_clip": 1.00200891, + "balance_loss_mlp": 1.00065255, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.5608573309962197, + "language_loss": 0.73141181, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75393724, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.604299545288086 + }, + { + "auxiliary_loss_clip": 0.01151754, + "auxiliary_loss_mlp": 0.01116592, + "balance_loss_clip": 1.00216365, + "balance_loss_mlp": 1.0006249, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 6.670080002440827, + "language_loss": 0.78115577, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80383927, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.608027935028076 + }, + { + "auxiliary_loss_clip": 0.01153222, + "auxiliary_loss_mlp": 0.01115798, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00059378, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.6870108682363374, + "language_loss": 0.69823611, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72092628, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.5505237579345703 + }, + { + "auxiliary_loss_clip": 0.01149473, + "auxiliary_loss_mlp": 0.01096861, + "balance_loss_clip": 1.00143158, + "balance_loss_mlp": 0.9999674, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8793682516882001, + "language_loss": 0.6336326, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65609598, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.0653796195983887 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.01115632, + "balance_loss_clip": 1.00200117, + "balance_loss_mlp": 1.00052381, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 1.7857841535271977, + "language_loss": 0.60397989, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62665403, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.642162799835205 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.01116717, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00065494, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.5713276448590006, + "language_loss": 0.682675, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70489419, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.781064033508301 + }, + { + "auxiliary_loss_clip": 0.01136591, + "auxiliary_loss_mlp": 0.01117366, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00063646, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.7382369511215547, + "language_loss": 0.67665958, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.6991992, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.7248411178588867 + }, + { + "auxiliary_loss_clip": 0.01118988, + "auxiliary_loss_mlp": 0.01116318, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00063741, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 3.0457438917610893, + "language_loss": 0.66366893, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68602204, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.6707863807678223 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01115963, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.000664, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.7079795685850931, + "language_loss": 0.74420804, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76671582, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.612182378768921 + }, + { + "auxiliary_loss_clip": 0.01136258, + "auxiliary_loss_mlp": 0.01116295, + "balance_loss_clip": 1.0020895, + "balance_loss_mlp": 1.00061452, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.525315071570717, + "language_loss": 0.6851846, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70771015, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.662966728210449 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01118101, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00060844, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.6253265892391209, + "language_loss": 0.78937835, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.81160688, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.6784656047821045 + }, + { + "auxiliary_loss_clip": 0.01151421, + "auxiliary_loss_mlp": 0.00747868, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00096929, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.9986959064764256, + "language_loss": 0.59944218, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61843503, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.6255171298980713 + }, + { + "auxiliary_loss_clip": 0.01134626, + "auxiliary_loss_mlp": 0.01116617, + "balance_loss_clip": 1.00187206, + "balance_loss_mlp": 1.00055516, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 2.184016097924252, + "language_loss": 0.76281738, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78532988, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.5887439250946045 + }, + { + "auxiliary_loss_clip": 0.01118865, + "auxiliary_loss_mlp": 0.01117313, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00067854, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.6370407949406223, + "language_loss": 0.74960554, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.77196729, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 2.6470296382904053 + }, + { + "auxiliary_loss_clip": 0.01151383, + "auxiliary_loss_mlp": 0.01116745, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00068307, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 1.9535732827907843, + "language_loss": 0.66580302, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68848431, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.596039295196533 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01115279, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00055218, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.951250975398461, + "language_loss": 0.63204873, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65423262, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.736751079559326 + }, + { + "auxiliary_loss_clip": 0.01137801, + "auxiliary_loss_mlp": 0.01116167, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00067711, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.3425569164573599, + "language_loss": 0.67501152, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69755113, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.6172149181365967 + }, + { + "auxiliary_loss_clip": 0.01137848, + "auxiliary_loss_mlp": 0.01117022, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.0007689, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 1.8756366133221412, + "language_loss": 0.62756062, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.65010929, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.552699565887451 + }, + { + "auxiliary_loss_clip": 0.01121694, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_clip": 1.00208926, + "balance_loss_mlp": 1.00045884, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.9490679780329618, + "language_loss": 0.67102617, + "learning_rate": 2.070672579324465e-06, + "loss": 0.6934036, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.61848783493042 + }, + { + "auxiliary_loss_clip": 0.01151765, + "auxiliary_loss_mlp": 0.01116628, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00075626, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.6508981825632971, + "language_loss": 0.71271336, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.73539722, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.629904270172119 + }, + { + "auxiliary_loss_clip": 0.01152773, + "auxiliary_loss_mlp": 0.01116012, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.0005219, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.8090384837361295, + "language_loss": 0.82753861, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85022652, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.5489139556884766 + }, + { + "auxiliary_loss_clip": 0.01151801, + "auxiliary_loss_mlp": 0.01116863, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00060964, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.4178569449654683, + "language_loss": 0.66620225, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68888891, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.6541197299957275 + }, + { + "auxiliary_loss_clip": 0.01088994, + "auxiliary_loss_mlp": 0.01114841, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00049567, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.3228240203151969, + "language_loss": 0.80172104, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82375944, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.708773612976074 + }, + { + "auxiliary_loss_clip": 0.0115123, + "auxiliary_loss_mlp": 0.01115773, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00056887, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.0619751418112493, + "language_loss": 0.69987595, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72254598, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.6173036098480225 + }, + { + "auxiliary_loss_clip": 0.01137154, + "auxiliary_loss_mlp": 0.01116263, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00058222, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.4963342807749445, + "language_loss": 0.69309372, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71562791, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 4.183476448059082 + }, + { + "auxiliary_loss_clip": 0.01133311, + "auxiliary_loss_mlp": 0.0109647, + "balance_loss_clip": 1.00165987, + "balance_loss_mlp": 1.00033975, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8432107105850486, + "language_loss": 0.52987254, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55217028, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 2.962231397628784 + }, + { + "auxiliary_loss_clip": 0.01116709, + "auxiliary_loss_mlp": 0.01096357, + "balance_loss_clip": 1.00152206, + "balance_loss_mlp": 1.00022697, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.865355941936817, + "language_loss": 0.60761404, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62974471, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 3.0156195163726807 + }, + { + "auxiliary_loss_clip": 0.0112154, + "auxiliary_loss_mlp": 0.0111524, + "balance_loss_clip": 1.00188696, + "balance_loss_mlp": 1.00060868, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.5238858148996945, + "language_loss": 0.84325325, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86562109, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.627627372741699 + }, + { + "auxiliary_loss_clip": 0.01120797, + "auxiliary_loss_mlp": 0.01115569, + "balance_loss_clip": 1.0018326, + "balance_loss_mlp": 1.00055623, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.083902454485777, + "language_loss": 0.50894707, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53131068, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 2.6773123741149902 + }, + { + "auxiliary_loss_clip": 0.01168268, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00059402, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.4574536745155535, + "language_loss": 0.753124, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77597415, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 3.9389777183532715 + }, + { + "auxiliary_loss_clip": 0.01153561, + "auxiliary_loss_mlp": 0.01115937, + "balance_loss_clip": 1.00213158, + "balance_loss_mlp": 1.0005424, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.045448841926676, + "language_loss": 0.67696863, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69966364, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.513115167617798 + }, + { + "auxiliary_loss_clip": 0.01151795, + "auxiliary_loss_mlp": 0.01115894, + "balance_loss_clip": 1.00230289, + "balance_loss_mlp": 1.00049901, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 2.0739090422574273, + "language_loss": 0.78609788, + "learning_rate": 2.065612518371792e-06, + "loss": 0.80877483, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.6058473587036133 + }, + { + "auxiliary_loss_clip": 0.01105569, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00044131, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.566243283778676, + "language_loss": 0.66641539, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68862557, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 4.1999499797821045 + }, + { + "auxiliary_loss_clip": 0.01151508, + "auxiliary_loss_mlp": 0.0074786, + "balance_loss_clip": 1.00212884, + "balance_loss_mlp": 1.00103164, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.5278607097416554, + "language_loss": 0.72008109, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73907483, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.6131670475006104 + }, + { + "auxiliary_loss_clip": 0.01136768, + "auxiliary_loss_mlp": 0.01116084, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00078464, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.9281808549007295, + "language_loss": 0.81142199, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83395052, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.592562198638916 + }, + { + "auxiliary_loss_clip": 0.01119416, + "auxiliary_loss_mlp": 0.01115888, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00058913, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 2.107950265376716, + "language_loss": 0.78460234, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.80695534, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 4.019606828689575 + }, + { + "auxiliary_loss_clip": 0.01168317, + "auxiliary_loss_mlp": 0.00747821, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00104308, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.65160122462996, + "language_loss": 0.70007497, + "learning_rate": 2.063666227349593e-06, + "loss": 0.71923631, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.5962777137756348 + }, + { + "auxiliary_loss_clip": 0.01152717, + "auxiliary_loss_mlp": 0.00747921, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00113738, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 3.6365624453625243, + "language_loss": 0.6932596, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71226597, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.587043046951294 + }, + { + "auxiliary_loss_clip": 0.01153082, + "auxiliary_loss_mlp": 0.0111593, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00072587, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.5359864691637835, + "language_loss": 0.85743797, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88012815, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.5918190479278564 + }, + { + "auxiliary_loss_clip": 0.01117823, + "auxiliary_loss_mlp": 0.00747731, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00100899, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.7789110186978485, + "language_loss": 0.75453287, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77318841, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.6391522884368896 + }, + { + "auxiliary_loss_clip": 0.01168239, + "auxiliary_loss_mlp": 0.01116826, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00047779, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.940724451948494, + "language_loss": 0.72828263, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75113326, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 2.64430832862854 + }, + { + "auxiliary_loss_clip": 0.01119026, + "auxiliary_loss_mlp": 0.01114677, + "balance_loss_clip": 1.00160921, + "balance_loss_mlp": 1.00052214, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.766708567471637, + "language_loss": 0.76470286, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.78703988, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 2.6462111473083496 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01116217, + "balance_loss_clip": 1.00194776, + "balance_loss_mlp": 1.00053644, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.567264285580355, + "language_loss": 0.63160825, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65399504, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.697530746459961 + }, + { + "auxiliary_loss_clip": 0.0113757, + "auxiliary_loss_mlp": 0.01117024, + "balance_loss_clip": 1.00206709, + "balance_loss_mlp": 1.00048518, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.7506631850984191, + "language_loss": 0.64094281, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.66348875, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 2.5839896202087402 + }, + { + "auxiliary_loss_clip": 0.01135877, + "auxiliary_loss_mlp": 0.01115354, + "balance_loss_clip": 1.0019902, + "balance_loss_mlp": 1.00053155, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3431108815354151, + "language_loss": 0.71068418, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73319644, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.6516339778900146 + }, + { + "auxiliary_loss_clip": 0.0113664, + "auxiliary_loss_mlp": 0.01117008, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.00075543, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.4384960759503862, + "language_loss": 0.79455388, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81709033, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.603954553604126 + }, + { + "auxiliary_loss_clip": 0.01168311, + "auxiliary_loss_mlp": 0.01116821, + "balance_loss_clip": 1.00211048, + "balance_loss_mlp": 1.00056791, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.6639276450734282, + "language_loss": 0.81527209, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83812344, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.50762677192688 + }, + { + "auxiliary_loss_clip": 0.01137781, + "auxiliary_loss_mlp": 0.01116259, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00067401, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 6.358584156380627, + "language_loss": 0.80610693, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82864738, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.5818567276000977 + }, + { + "auxiliary_loss_clip": 0.0111801, + "auxiliary_loss_mlp": 0.00747925, + "balance_loss_clip": 1.0017916, + "balance_loss_mlp": 1.00115061, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.8670575414172605, + "language_loss": 0.80257213, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82123148, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.610935926437378 + }, + { + "auxiliary_loss_clip": 0.01153279, + "auxiliary_loss_mlp": 0.01116296, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00051951, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.1815933098761984, + "language_loss": 0.61943078, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64212644, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.6479098796844482 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01116266, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00058556, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.498345937213193, + "language_loss": 0.81544948, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.83780301, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.6201670169830322 + }, + { + "auxiliary_loss_clip": 0.01118224, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.00200343, + "balance_loss_mlp": 1.00069427, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.515556496250661, + "language_loss": 0.79263556, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81498057, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.6760060787200928 + }, + { + "auxiliary_loss_clip": 0.0110293, + "auxiliary_loss_mlp": 0.01114958, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.00061274, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.9845143400882643, + "language_loss": 0.62619776, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.64837664, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.672788619995117 + }, + { + "auxiliary_loss_clip": 0.01119683, + "auxiliary_loss_mlp": 0.01116431, + "balance_loss_clip": 1.00174701, + "balance_loss_mlp": 1.00046444, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.4274514061367225, + "language_loss": 0.77459311, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79695421, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 2.6200764179229736 + }, + { + "auxiliary_loss_clip": 0.01087879, + "auxiliary_loss_mlp": 0.01116746, + "balance_loss_clip": 1.00218737, + "balance_loss_mlp": 1.00049353, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.8763717189671987, + "language_loss": 0.77045298, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79249924, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.7248644828796387 + }, + { + "auxiliary_loss_clip": 0.01168193, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_clip": 1.00200343, + "balance_loss_mlp": 1.00056052, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.6415637161670062, + "language_loss": 0.7761147, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79896474, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.515328884124756 + }, + { + "auxiliary_loss_clip": 0.01152971, + "auxiliary_loss_mlp": 0.0111629, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00051367, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4298571689520343, + "language_loss": 0.66883361, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69152623, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.5788257122039795 + }, + { + "auxiliary_loss_clip": 0.0116819, + "auxiliary_loss_mlp": 0.01117004, + "balance_loss_clip": 1.00207412, + "balance_loss_mlp": 1.00065625, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.6222663707503788, + "language_loss": 0.81820744, + "learning_rate": 2.05549116746431e-06, + "loss": 0.84105933, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.5861434936523438 + }, + { + "auxiliary_loss_clip": 0.01168267, + "auxiliary_loss_mlp": 0.00748026, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00111246, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.6178636147035266, + "language_loss": 0.74577051, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76493347, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.5475430488586426 + }, + { + "auxiliary_loss_clip": 0.01168111, + "auxiliary_loss_mlp": 0.01116794, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00073183, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.4703523699816405, + "language_loss": 0.715734, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73858309, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.6657564640045166 + }, + { + "auxiliary_loss_clip": 0.01119366, + "auxiliary_loss_mlp": 0.01117421, + "balance_loss_clip": 1.00179899, + "balance_loss_mlp": 1.00069165, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.7294137036654906, + "language_loss": 0.78638399, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80875182, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.6345407962799072 + }, + { + "auxiliary_loss_clip": 0.01151459, + "auxiliary_loss_mlp": 0.01116914, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00066113, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.023044082232747, + "language_loss": 0.77995437, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80263805, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.5519371032714844 + }, + { + "auxiliary_loss_clip": 0.01168184, + "auxiliary_loss_mlp": 0.01116637, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00038433, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.7013090460789035, + "language_loss": 0.71600246, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73885071, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 4.08041524887085 + }, + { + "auxiliary_loss_clip": 0.01152414, + "auxiliary_loss_mlp": 0.00747803, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.00100839, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 3.348248095762293, + "language_loss": 0.82855403, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84755623, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.5881733894348145 + }, + { + "auxiliary_loss_clip": 0.01118235, + "auxiliary_loss_mlp": 0.01117377, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00055206, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 1.7109155542247594, + "language_loss": 0.73556364, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75791973, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.689523696899414 + }, + { + "auxiliary_loss_clip": 0.0108981, + "auxiliary_loss_mlp": 0.01116419, + "balance_loss_clip": 1.00175548, + "balance_loss_mlp": 1.00054741, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.5702940435727357, + "language_loss": 0.7660588, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.7881211, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.7195003032684326 + }, + { + "auxiliary_loss_clip": 0.0115328, + "auxiliary_loss_mlp": 0.01116329, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.0006485, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4170282005513934, + "language_loss": 0.72357827, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74627435, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 2.537501335144043 + }, + { + "auxiliary_loss_clip": 0.01099199, + "auxiliary_loss_mlp": 0.01096363, + "balance_loss_clip": 1.00154853, + "balance_loss_mlp": 1.00023305, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7497172942322721, + "language_loss": 0.63699341, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65894902, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 4.622269630432129 + }, + { + "auxiliary_loss_clip": 0.01117999, + "auxiliary_loss_mlp": 0.01116292, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00089693, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 1.719006795268674, + "language_loss": 0.77629477, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79863763, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.605545997619629 + }, + { + "auxiliary_loss_clip": 0.01136259, + "auxiliary_loss_mlp": 0.01116664, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00069678, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 2.2152712486551835, + "language_loss": 0.70638782, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.728917, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 2.5917370319366455 + }, + { + "auxiliary_loss_clip": 0.01153328, + "auxiliary_loss_mlp": 0.01117531, + "balance_loss_clip": 1.00212312, + "balance_loss_mlp": 1.00070643, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.8378940071440562, + "language_loss": 0.71955383, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74226248, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 3.938127279281616 + }, + { + "auxiliary_loss_clip": 0.01168297, + "auxiliary_loss_mlp": 0.01117103, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.0006597, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 2.1857670587064764, + "language_loss": 0.83768272, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86053669, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.5149035453796387 + }, + { + "auxiliary_loss_clip": 0.01151521, + "auxiliary_loss_mlp": 0.01115158, + "balance_loss_clip": 1.00184214, + "balance_loss_mlp": 1.00052631, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.3333342693193035, + "language_loss": 0.80747712, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83014387, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.5941474437713623 + }, + { + "auxiliary_loss_clip": 0.01118513, + "auxiliary_loss_mlp": 0.01116836, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.00067902, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 2.4820577982932033, + "language_loss": 0.79730457, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81965804, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 4.019895076751709 + }, + { + "auxiliary_loss_clip": 0.01136478, + "auxiliary_loss_mlp": 0.00747921, + "balance_loss_clip": 1.00196421, + "balance_loss_mlp": 1.00105715, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.4165836609014013, + "language_loss": 0.71137476, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73021877, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.611473321914673 + }, + { + "auxiliary_loss_clip": 0.01138497, + "auxiliary_loss_mlp": 0.01116147, + "balance_loss_clip": 1.00205994, + "balance_loss_mlp": 1.00075245, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.504508787597113, + "language_loss": 0.70690584, + "learning_rate": 2.048483229511158e-06, + "loss": 0.72945231, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.636420488357544 + }, + { + "auxiliary_loss_clip": 0.011514, + "auxiliary_loss_mlp": 0.00747923, + "balance_loss_clip": 1.00199044, + "balance_loss_mlp": 1.00101757, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.422904577881047, + "language_loss": 0.63510537, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65409863, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 2.560415744781494 + }, + { + "auxiliary_loss_clip": 0.01102577, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_clip": 1.00168478, + "balance_loss_mlp": 1.0006249, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.804323170699342, + "language_loss": 0.715123, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73729944, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.7501730918884277 + }, + { + "auxiliary_loss_clip": 0.01075452, + "auxiliary_loss_mlp": 0.01116283, + "balance_loss_clip": 1.00196898, + "balance_loss_mlp": 1.00060201, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.2046198214301023, + "language_loss": 0.62090182, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64281917, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 2.8640859127044678 + }, + { + "auxiliary_loss_clip": 0.01119089, + "auxiliary_loss_mlp": 0.01115157, + "balance_loss_clip": 1.00177979, + "balance_loss_mlp": 1.00052512, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.6537276359158268, + "language_loss": 0.64034283, + "learning_rate": 2.046925826041012e-06, + "loss": 0.66268528, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 2.7373828887939453 + }, + { + "auxiliary_loss_clip": 0.01118068, + "auxiliary_loss_mlp": 0.01095873, + "balance_loss_clip": 1.00160682, + "balance_loss_mlp": 1.00012457, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8218894185514032, + "language_loss": 0.61945385, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.64159328, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.216938018798828 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.0006634, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 2.2062493655973983, + "language_loss": 0.80516946, + "learning_rate": 2.04614711357029e-06, + "loss": 0.8275001, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.6395833492279053 + }, + { + "auxiliary_loss_clip": 0.01152641, + "auxiliary_loss_mlp": 0.0111512, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00058353, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.3192134461595482, + "language_loss": 0.70596361, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72864127, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 2.642317771911621 + }, + { + "auxiliary_loss_clip": 0.01168098, + "auxiliary_loss_mlp": 0.007477, + "balance_loss_clip": 1.00210059, + "balance_loss_mlp": 1.00091243, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.4472972686135528, + "language_loss": 0.71959722, + "learning_rate": 2.045368394099955e-06, + "loss": 0.73875517, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.701655864715576 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_clip": 1.00178969, + "balance_loss_mlp": 1.00060344, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4454949708516691, + "language_loss": 0.72739565, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74990427, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.616727828979492 + }, + { + "auxiliary_loss_clip": 0.01168049, + "auxiliary_loss_mlp": 0.01115705, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.0005964, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.731703327913446, + "language_loss": 0.77070743, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79354495, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.5375688076019287 + }, + { + "auxiliary_loss_clip": 0.01168169, + "auxiliary_loss_mlp": 0.01116373, + "balance_loss_clip": 1.00207317, + "balance_loss_mlp": 1.00078797, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.793919973480001, + "language_loss": 0.85031044, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87315583, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.5086143016815186 + }, + { + "auxiliary_loss_clip": 0.01168289, + "auxiliary_loss_mlp": 0.01117214, + "balance_loss_clip": 1.0020467, + "balance_loss_mlp": 1.0006752, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.6845749046927727, + "language_loss": 0.78221107, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80506611, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.4964098930358887 + }, + { + "auxiliary_loss_clip": 0.01119203, + "auxiliary_loss_mlp": 0.01114713, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.00065398, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 2.0792657137250363, + "language_loss": 0.76679742, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78913659, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 2.682434320449829 + }, + { + "auxiliary_loss_clip": 0.01136457, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_clip": 1.00198591, + "balance_loss_mlp": 1.00083208, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.5534064663431206, + "language_loss": 0.89572394, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91825551, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.5988922119140625 + }, + { + "auxiliary_loss_clip": 0.0113636, + "auxiliary_loss_mlp": 0.00747904, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00096869, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.8013138090219052, + "language_loss": 0.62444532, + "learning_rate": 2.042642822537149e-06, + "loss": 0.6432879, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.5952467918395996 + }, + { + "auxiliary_loss_clip": 0.01147448, + "auxiliary_loss_mlp": 0.01096426, + "balance_loss_clip": 1.00135207, + "balance_loss_mlp": 1.00029552, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 1.0053958930006186, + "language_loss": 0.624089, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64652777, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 3.009152889251709 + }, + { + "auxiliary_loss_clip": 0.01153242, + "auxiliary_loss_mlp": 0.011167, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.00054216, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.6381471804049696, + "language_loss": 0.67725348, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69995284, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.554774761199951 + }, + { + "auxiliary_loss_clip": 0.01151243, + "auxiliary_loss_mlp": 0.01115664, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00055504, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.7742202805322609, + "language_loss": 0.77947664, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.8021456, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.589029312133789 + }, + { + "auxiliary_loss_clip": 0.01168398, + "auxiliary_loss_mlp": 0.01117587, + "balance_loss_clip": 1.00219536, + "balance_loss_mlp": 1.00057149, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 1.9127001770862961, + "language_loss": 0.80663955, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82949936, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 2.469926118850708 + }, + { + "auxiliary_loss_clip": 0.01137814, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_clip": 1.00200093, + "balance_loss_mlp": 1.00073695, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.4516741874050771, + "language_loss": 0.69030648, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71284693, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.602609872817993 + }, + { + "auxiliary_loss_clip": 0.0116796, + "auxiliary_loss_mlp": 0.01116176, + "balance_loss_clip": 1.00194514, + "balance_loss_mlp": 1.00059032, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.912191092932751, + "language_loss": 0.7597003, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78254163, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.5346271991729736 + }, + { + "auxiliary_loss_clip": 0.01119032, + "auxiliary_loss_mlp": 0.01116259, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.00057852, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 2.06402952554246, + "language_loss": 0.81281078, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83516371, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.606229782104492 + }, + { + "auxiliary_loss_clip": 0.01153205, + "auxiliary_loss_mlp": 0.01116137, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00083828, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.6888244952835592, + "language_loss": 0.75916231, + "learning_rate": 2.039527786882341e-06, + "loss": 0.7818557, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.5484442710876465 + }, + { + "auxiliary_loss_clip": 0.0114742, + "auxiliary_loss_mlp": 0.01096237, + "balance_loss_clip": 1.00141335, + "balance_loss_mlp": 1.00010681, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6844204930390227, + "language_loss": 0.59359848, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61603504, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.2378756999969482 + }, + { + "auxiliary_loss_clip": 0.01168209, + "auxiliary_loss_mlp": 0.01116005, + "balance_loss_clip": 1.0021193, + "balance_loss_mlp": 1.00061023, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.7723341254146865, + "language_loss": 0.80551285, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82835501, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 3.9441328048706055 + }, + { + "auxiliary_loss_clip": 0.01153081, + "auxiliary_loss_mlp": 0.01114888, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.0004468, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.635033361200088, + "language_loss": 0.78349382, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.8061735, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.5537660121917725 + }, + { + "auxiliary_loss_clip": 0.01167893, + "auxiliary_loss_mlp": 0.01114882, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00063229, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 3.090770889522189, + "language_loss": 0.74605823, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76888603, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.51269268989563 + }, + { + "auxiliary_loss_clip": 0.01168076, + "auxiliary_loss_mlp": 0.01116289, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00051284, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.6966079606321822, + "language_loss": 0.7765342, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79937786, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.4919207096099854 + }, + { + "auxiliary_loss_clip": 0.0115358, + "auxiliary_loss_mlp": 0.01116205, + "balance_loss_clip": 1.00229526, + "balance_loss_mlp": 1.00081086, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.4741362670428506, + "language_loss": 0.69549996, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71819782, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 2.618572473526001 + }, + { + "auxiliary_loss_clip": 0.01136179, + "auxiliary_loss_mlp": 0.01116967, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00071454, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.0385514299884404, + "language_loss": 0.73454595, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75707746, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 3.991006851196289 + }, + { + "auxiliary_loss_clip": 0.01164188, + "auxiliary_loss_mlp": 0.01096243, + "balance_loss_clip": 1.00144887, + "balance_loss_mlp": 1.00011253, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.75157723195873, + "language_loss": 0.5811739, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60377824, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.103217601776123 + }, + { + "auxiliary_loss_clip": 0.01104625, + "auxiliary_loss_mlp": 0.0111556, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00064182, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.7441397497037279, + "language_loss": 0.68845499, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71065682, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.6567656993865967 + }, + { + "auxiliary_loss_clip": 0.01134147, + "auxiliary_loss_mlp": 0.01116753, + "balance_loss_clip": 1.0019747, + "balance_loss_mlp": 1.00069118, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.941543547624801, + "language_loss": 0.85355711, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87606615, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 4.114164590835571 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01115994, + "balance_loss_clip": 1.00227094, + "balance_loss_mlp": 1.00059891, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 1.9055765184436477, + "language_loss": 0.65091383, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67341685, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.55989146232605 + }, + { + "auxiliary_loss_clip": 0.01136521, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00076962, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.4438403052291644, + "language_loss": 0.81931239, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84185165, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 2.5992016792297363 + }, + { + "auxiliary_loss_clip": 0.01106793, + "auxiliary_loss_mlp": 0.01117215, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00067592, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.954016642184153, + "language_loss": 0.81061733, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83285743, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 4.1425135135650635 + }, + { + "auxiliary_loss_clip": 0.01136342, + "auxiliary_loss_mlp": 0.01116846, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00049758, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 3.2981687477196493, + "language_loss": 0.61630166, + "learning_rate": 2.034076248204082e-06, + "loss": 0.63883352, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.577495813369751 + }, + { + "auxiliary_loss_clip": 0.0115132, + "auxiliary_loss_mlp": 0.01116618, + "balance_loss_clip": 1.00204599, + "balance_loss_mlp": 1.00084233, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.9496456122438612, + "language_loss": 0.65892142, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.68160081, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.608044147491455 + }, + { + "auxiliary_loss_clip": 0.01151478, + "auxiliary_loss_mlp": 0.01115156, + "balance_loss_clip": 1.0020473, + "balance_loss_mlp": 1.00052392, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.450339901190915, + "language_loss": 0.69364232, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71630859, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.603213310241699 + }, + { + "auxiliary_loss_clip": 0.01168244, + "auxiliary_loss_mlp": 0.01116047, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00055683, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 5.8039931030607566, + "language_loss": 0.79085898, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81370193, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.57051420211792 + }, + { + "auxiliary_loss_clip": 0.01151108, + "auxiliary_loss_mlp": 0.01115809, + "balance_loss_clip": 1.00187719, + "balance_loss_mlp": 1.00079608, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.5176888669936037, + "language_loss": 0.83234292, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85501206, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.5681793689727783 + }, + { + "auxiliary_loss_clip": 0.01153224, + "auxiliary_loss_mlp": 0.00747878, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00096989, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.758015135721216, + "language_loss": 0.85186565, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87087667, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.6597602367401123 + }, + { + "auxiliary_loss_clip": 0.01151362, + "auxiliary_loss_mlp": 0.01115498, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00058007, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.876828764144096, + "language_loss": 0.83182406, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85449266, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01136358, + "auxiliary_loss_mlp": 0.0111544, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00052238, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 1.9563688762327023, + "language_loss": 0.81502688, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83754492, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.5979039669036865 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01115496, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00057828, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 1.8093994651762728, + "language_loss": 0.73335218, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.75588942, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.5780696868896484 + }, + { + "auxiliary_loss_clip": 0.01121834, + "auxiliary_loss_mlp": 0.01116402, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.0005306, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.50688335710741, + "language_loss": 0.69913137, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72151375, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.6486172676086426 + }, + { + "auxiliary_loss_clip": 0.01136115, + "auxiliary_loss_mlp": 0.01115857, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.0007484, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 1.963044899529023, + "language_loss": 0.73000622, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75252593, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 2.607537031173706 + }, + { + "auxiliary_loss_clip": 0.01120098, + "auxiliary_loss_mlp": 0.00747947, + "balance_loss_clip": 1.00199926, + "balance_loss_mlp": 1.00110948, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.8176280896840693, + "language_loss": 0.69570661, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71438706, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.623006582260132 + }, + { + "auxiliary_loss_clip": 0.01135755, + "auxiliary_loss_mlp": 0.01116205, + "balance_loss_clip": 1.00214553, + "balance_loss_mlp": 1.00071537, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.6052761058246092, + "language_loss": 0.72760838, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75012797, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.62695574760437 + }, + { + "auxiliary_loss_clip": 0.01135876, + "auxiliary_loss_mlp": 0.01115349, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00071716, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.6052166115995763, + "language_loss": 0.80515635, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82766861, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.6044046878814697 + }, + { + "auxiliary_loss_clip": 0.01152357, + "auxiliary_loss_mlp": 0.01115119, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00058246, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.166335467068762, + "language_loss": 0.79009044, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81276518, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.553560495376587 + }, + { + "auxiliary_loss_clip": 0.01119505, + "auxiliary_loss_mlp": 0.0111584, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00063634, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 1.6716942483055837, + "language_loss": 0.77417904, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79653251, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.6295619010925293 + }, + { + "auxiliary_loss_clip": 0.01119905, + "auxiliary_loss_mlp": 0.01116698, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00054049, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 1.9331685541286048, + "language_loss": 0.83801401, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.86038005, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.644724130630493 + }, + { + "auxiliary_loss_clip": 0.01168117, + "auxiliary_loss_mlp": 0.01116277, + "balance_loss_clip": 1.00203645, + "balance_loss_mlp": 1.00069118, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.934079741742932, + "language_loss": 0.79359132, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81643522, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.530775308609009 + }, + { + "auxiliary_loss_clip": 0.01119154, + "auxiliary_loss_mlp": 0.01115938, + "balance_loss_clip": 1.00186503, + "balance_loss_mlp": 1.00054312, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.4605986790917345, + "language_loss": 0.78527784, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80762881, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.01151189, + "auxiliary_loss_mlp": 0.01116153, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00056791, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.731350560749631, + "language_loss": 0.78304148, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80571485, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.521746873855591 + }, + { + "auxiliary_loss_clip": 0.01168009, + "auxiliary_loss_mlp": 0.01115932, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00053716, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.5293078944489842, + "language_loss": 0.81932551, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84216487, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.5472562313079834 + }, + { + "auxiliary_loss_clip": 0.01138448, + "auxiliary_loss_mlp": 0.0074772, + "balance_loss_clip": 1.00222385, + "balance_loss_mlp": 1.00099409, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.7831734339792056, + "language_loss": 0.7057001, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72456175, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 2.5939431190490723 + }, + { + "auxiliary_loss_clip": 0.01102786, + "auxiliary_loss_mlp": 0.01115336, + "balance_loss_clip": 1.00183713, + "balance_loss_mlp": 1.00051343, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.428639541135715, + "language_loss": 0.72035849, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.7425397, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.7774007320404053 + }, + { + "auxiliary_loss_clip": 0.01151415, + "auxiliary_loss_mlp": 0.01117527, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00060642, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.462978194547038, + "language_loss": 0.62303257, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.64572191, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.5165867805480957 + }, + { + "auxiliary_loss_clip": 0.01168121, + "auxiliary_loss_mlp": 0.01116543, + "balance_loss_clip": 1.00191033, + "balance_loss_mlp": 1.00067174, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7075095345581919, + "language_loss": 0.88046116, + "learning_rate": 2.024730186540907e-06, + "loss": 0.9033078, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.504819631576538 + }, + { + "auxiliary_loss_clip": 0.01151021, + "auxiliary_loss_mlp": 0.01115584, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.0006659, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.356595662768694, + "language_loss": 0.82452059, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84718668, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 4.120879411697388 + }, + { + "auxiliary_loss_clip": 0.01130499, + "auxiliary_loss_mlp": 0.01095575, + "balance_loss_clip": 1.00131941, + "balance_loss_mlp": 1.00020778, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8474381803116949, + "language_loss": 0.63829744, + "learning_rate": 2.023951320871339e-06, + "loss": 0.66055822, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.175325870513916 + }, + { + "auxiliary_loss_clip": 0.01121199, + "auxiliary_loss_mlp": 0.00747644, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00089455, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 1.6139448960155156, + "language_loss": 0.84222305, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86091149, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.6776647567749023 + }, + { + "auxiliary_loss_clip": 0.01151409, + "auxiliary_loss_mlp": 0.01115625, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00061202, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.8126866786934175, + "language_loss": 0.75142252, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77409285, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.6048591136932373 + }, + { + "auxiliary_loss_clip": 0.01167984, + "auxiliary_loss_mlp": 0.01116512, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00064075, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 2.1917437590287405, + "language_loss": 0.57779789, + "learning_rate": 2.022783015592131e-06, + "loss": 0.6006428, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.5315728187561035 + }, + { + "auxiliary_loss_clip": 0.01151631, + "auxiliary_loss_mlp": 0.01116797, + "balance_loss_clip": 1.00207019, + "balance_loss_mlp": 1.00083041, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.8109012682424541, + "language_loss": 0.85510802, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87779236, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 2.5498926639556885 + }, + { + "auxiliary_loss_clip": 0.01119176, + "auxiliary_loss_mlp": 0.00747834, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00100875, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.5306876806450764, + "language_loss": 0.72285163, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74152178, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 4.047066926956177 + }, + { + "auxiliary_loss_clip": 0.01167838, + "auxiliary_loss_mlp": 0.00747827, + "balance_loss_clip": 1.00203919, + "balance_loss_mlp": 1.00103617, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.6688341491086762, + "language_loss": 0.76373363, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78289032, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.5099077224731445 + }, + { + "auxiliary_loss_clip": 0.01168048, + "auxiliary_loss_mlp": 0.01115171, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.00072992, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.45844760699591, + "language_loss": 0.71571684, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73854905, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 4.05018949508667 + }, + { + "auxiliary_loss_clip": 0.01136381, + "auxiliary_loss_mlp": 0.01116521, + "balance_loss_clip": 1.00206959, + "balance_loss_mlp": 1.00064933, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 1.956262770839915, + "language_loss": 0.66575116, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68828017, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.589942216873169 + }, + { + "auxiliary_loss_clip": 0.01088646, + "auxiliary_loss_mlp": 0.01116524, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00065219, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 1.6485661089978558, + "language_loss": 0.66533732, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.68738902, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.725951910018921 + }, + { + "auxiliary_loss_clip": 0.01118078, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_clip": 1.00190938, + "balance_loss_mlp": 1.00057149, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.131338774088232, + "language_loss": 0.69028032, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71261501, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.6473355293273926 + }, + { + "auxiliary_loss_clip": 0.01167908, + "auxiliary_loss_mlp": 0.011152, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00056815, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.5082527470237326, + "language_loss": 0.65923578, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68206686, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 3.958070755004883 + }, + { + "auxiliary_loss_clip": 0.01150988, + "auxiliary_loss_mlp": 0.01115514, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.00069189, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 1.9234528461338822, + "language_loss": 0.75882632, + "learning_rate": 2.019278054696955e-06, + "loss": 0.7814914, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.5788979530334473 + }, + { + "auxiliary_loss_clip": 0.01136548, + "auxiliary_loss_mlp": 0.01116155, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.00076056, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 2.3671061750295133, + "language_loss": 0.78144193, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80396903, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.5612986087799072 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00071752, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 2.0885953305483063, + "language_loss": 0.7412734, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76396263, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.6393136978149414 + }, + { + "auxiliary_loss_clip": 0.01151514, + "auxiliary_loss_mlp": 0.01116223, + "balance_loss_clip": 1.00175381, + "balance_loss_mlp": 1.00073349, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.544325140278679, + "language_loss": 0.78370333, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80638075, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.5379014015197754 + }, + { + "auxiliary_loss_clip": 0.01167993, + "auxiliary_loss_mlp": 0.01115835, + "balance_loss_clip": 1.00208342, + "balance_loss_mlp": 1.00063086, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.430787207086, + "language_loss": 0.79516459, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81800288, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.5530378818511963 + }, + { + "auxiliary_loss_clip": 0.01136446, + "auxiliary_loss_mlp": 0.01117012, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.0007596, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.611081322409283, + "language_loss": 0.81573969, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83827424, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 2.5612683296203613 + }, + { + "auxiliary_loss_clip": 0.01152452, + "auxiliary_loss_mlp": 0.01115354, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.00053132, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.7266544134882884, + "language_loss": 0.68351233, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70619035, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.582611560821533 + }, + { + "auxiliary_loss_clip": 0.01118791, + "auxiliary_loss_mlp": 0.01118032, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00063419, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.9485283587732285, + "language_loss": 0.62036741, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.64273566, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.6717586517333984 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01116083, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00068796, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 1.8150781606804836, + "language_loss": 0.77752149, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.79989094, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.6253273487091064 + }, + { + "auxiliary_loss_clip": 0.01136423, + "auxiliary_loss_mlp": 0.01115794, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00068581, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.6433367934648833, + "language_loss": 0.74643832, + "learning_rate": 2.015773034588706e-06, + "loss": 0.76896048, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.5819058418273926 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00068545, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.4717327483699822, + "language_loss": 0.74490392, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76743346, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.7105765342712402 + }, + { + "auxiliary_loss_clip": 0.01152633, + "auxiliary_loss_mlp": 0.01116047, + "balance_loss_clip": 1.00195944, + "balance_loss_mlp": 1.00065219, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.5831357199033342, + "language_loss": 0.65310973, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67579651, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 2.5680956840515137 + }, + { + "auxiliary_loss_clip": 0.01134023, + "auxiliary_loss_mlp": 0.01114728, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00066805, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.4254588467003915, + "language_loss": 0.74340552, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76589304, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.6005542278289795 + }, + { + "auxiliary_loss_clip": 0.01151171, + "auxiliary_loss_mlp": 0.01116001, + "balance_loss_clip": 1.00181091, + "balance_loss_mlp": 1.00060654, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.7944708333693156, + "language_loss": 0.82619774, + "learning_rate": 2.014215231682995e-06, + "loss": 0.84886944, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.560171127319336 + }, + { + "auxiliary_loss_clip": 0.01102688, + "auxiliary_loss_mlp": 0.01114144, + "balance_loss_clip": 1.00181329, + "balance_loss_mlp": 1.00056124, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.6967511019319754, + "language_loss": 0.73753726, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75970566, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.653311252593994 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.01096305, + "balance_loss_clip": 1.00146151, + "balance_loss_mlp": 1.0001744, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7522008459804563, + "language_loss": 0.60755467, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62967646, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.2883071899414062 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01116107, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00061679, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6647005679440399, + "language_loss": 0.76533234, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78785849, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.582623243331909 + }, + { + "auxiliary_loss_clip": 0.0113451, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00062132, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.1984714479883, + "language_loss": 0.67614895, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69865131, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.7139008045196533 + }, + { + "auxiliary_loss_clip": 0.01117664, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_clip": 1.00174344, + "balance_loss_mlp": 1.00081098, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.9018547283284457, + "language_loss": 0.81990469, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84224343, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 2.6215643882751465 + }, + { + "auxiliary_loss_clip": 0.01153042, + "auxiliary_loss_mlp": 0.01116719, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00075233, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 9.821343576163406, + "language_loss": 0.63795942, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.66065705, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.5853140354156494 + }, + { + "auxiliary_loss_clip": 0.01151171, + "auxiliary_loss_mlp": 0.01115642, + "balance_loss_clip": 1.00195861, + "balance_loss_mlp": 1.00072432, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.70343882246223, + "language_loss": 0.69963235, + "learning_rate": 2.011489056413418e-06, + "loss": 0.72230053, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.546344041824341 + }, + { + "auxiliary_loss_clip": 0.01151217, + "auxiliary_loss_mlp": 0.01115851, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00083816, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.524606686902319, + "language_loss": 0.70981658, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73248732, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 2.5486702919006348 + }, + { + "auxiliary_loss_clip": 0.01104295, + "auxiliary_loss_mlp": 0.01114959, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00061297, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 2.653107037842877, + "language_loss": 0.80240893, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82460153, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.635806083679199 + }, + { + "auxiliary_loss_clip": 0.01151055, + "auxiliary_loss_mlp": 0.01114929, + "balance_loss_clip": 1.00182712, + "balance_loss_mlp": 1.0006783, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.5961859177287498, + "language_loss": 0.78587568, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80853552, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.6035006046295166 + }, + { + "auxiliary_loss_clip": 0.01136323, + "auxiliary_loss_mlp": 0.01116306, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00052977, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.5954607016066742, + "language_loss": 0.76055568, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78308195, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 4.097897529602051 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.01117337, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.00060713, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 1.6991533928760088, + "language_loss": 0.74626517, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76845348, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 2.6410186290740967 + }, + { + "auxiliary_loss_clip": 0.01087108, + "auxiliary_loss_mlp": 0.01115157, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.0006212, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.6101886154030987, + "language_loss": 0.70412427, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72614694, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.6933364868164062 + }, + { + "auxiliary_loss_clip": 0.01135786, + "auxiliary_loss_mlp": 0.01116462, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00059032, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.8320392386721716, + "language_loss": 0.79605532, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81857783, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.607295036315918 + }, + { + "auxiliary_loss_clip": 0.01135877, + "auxiliary_loss_mlp": 0.01116064, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.00086021, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.8402597647604126, + "language_loss": 0.67986417, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70238352, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.6486995220184326 + }, + { + "auxiliary_loss_clip": 0.01123135, + "auxiliary_loss_mlp": 0.0111624, + "balance_loss_clip": 1.00186503, + "balance_loss_mlp": 1.00055945, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.0722773522971094, + "language_loss": 0.72391343, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74630719, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.622802734375 + }, + { + "auxiliary_loss_clip": 0.01151249, + "auxiliary_loss_mlp": 0.01116137, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00074244, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.9385387668409186, + "language_loss": 0.82174242, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84441626, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 3.911219596862793 + }, + { + "auxiliary_loss_clip": 0.01153146, + "auxiliary_loss_mlp": 0.01116225, + "balance_loss_clip": 1.00197232, + "balance_loss_mlp": 1.00073469, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.6656600902921574, + "language_loss": 0.73732418, + "learning_rate": 2.007205025522544e-06, + "loss": 0.76001787, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.567208766937256 + }, + { + "auxiliary_loss_clip": 0.01152465, + "auxiliary_loss_mlp": 0.0111564, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.00081801, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.6461888890134042, + "language_loss": 0.73658729, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.7592684, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01116164, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00067437, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.711633191310462, + "language_loss": 0.82153201, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.8439151, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 2.6212515830993652 + }, + { + "auxiliary_loss_clip": 0.01153075, + "auxiliary_loss_mlp": 0.01113868, + "balance_loss_clip": 1.00206685, + "balance_loss_mlp": 1.00047565, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 2.4748285397990353, + "language_loss": 0.71570891, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.73837835, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 3.9499893188476562 + }, + { + "auxiliary_loss_clip": 0.01151648, + "auxiliary_loss_mlp": 0.01116549, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.00067747, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.4960623271077766, + "language_loss": 0.75277925, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.7754612, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.556204319000244 + }, + { + "auxiliary_loss_clip": 0.01135615, + "auxiliary_loss_mlp": 0.01115628, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00051928, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.7273008443517528, + "language_loss": 0.68886995, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71138239, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 2.618809223175049 + }, + { + "auxiliary_loss_clip": 0.01151177, + "auxiliary_loss_mlp": 0.01115555, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00063729, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.7848415902026107, + "language_loss": 0.74800551, + "learning_rate": 2.004868266210965e-06, + "loss": 0.77067286, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 3.9606471061706543 + }, + { + "auxiliary_loss_clip": 0.01167982, + "auxiliary_loss_mlp": 0.01115866, + "balance_loss_clip": 1.00200999, + "balance_loss_mlp": 1.00066245, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.9092339034659611, + "language_loss": 0.6778599, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70069838, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.5509116649627686 + }, + { + "auxiliary_loss_clip": 0.01153109, + "auxiliary_loss_mlp": 0.01116825, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00076294, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 4.925338574903895, + "language_loss": 0.73685908, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75955844, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.566077470779419 + }, + { + "auxiliary_loss_clip": 0.01118977, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00069046, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.6602540786102864, + "language_loss": 0.74927497, + "learning_rate": 2.003699883863633e-06, + "loss": 0.77161992, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.597696542739868 + }, + { + "auxiliary_loss_clip": 0.01120351, + "auxiliary_loss_mlp": 0.01114846, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00069094, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7982448270214915, + "language_loss": 0.86123955, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88359153, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.6183149814605713 + }, + { + "auxiliary_loss_clip": 0.0115278, + "auxiliary_loss_mlp": 0.01115443, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00071549, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.599353311135068, + "language_loss": 0.89098227, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91366458, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.01167865, + "auxiliary_loss_mlp": 0.00747789, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00098085, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.8991776517570025, + "language_loss": 0.65035778, + "learning_rate": 2.002531500253602e-06, + "loss": 0.6695143, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.5062458515167236 + }, + { + "auxiliary_loss_clip": 0.01153397, + "auxiliary_loss_mlp": 0.00747913, + "balance_loss_clip": 1.00221109, + "balance_loss_mlp": 1.00108421, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.61681108436743, + "language_loss": 0.63246083, + "learning_rate": 2.002142038838577e-06, + "loss": 0.651474, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.599027156829834 + }, + { + "auxiliary_loss_clip": 0.01168002, + "auxiliary_loss_mlp": 0.01115837, + "balance_loss_clip": 1.00202179, + "balance_loss_mlp": 1.00053787, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 2.858223376658484, + "language_loss": 0.70196784, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72480625, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.5314693450927734 + }, + { + "auxiliary_loss_clip": 0.01138144, + "auxiliary_loss_mlp": 0.01116185, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00050426, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5333907393551982, + "language_loss": 0.66918409, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.6917274, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.621095657348633 + }, + { + "auxiliary_loss_clip": 0.0115269, + "auxiliary_loss_mlp": 0.01116545, + "balance_loss_clip": 1.00202489, + "balance_loss_mlp": 1.00067341, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.5955219073843918, + "language_loss": 0.77693039, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79962271, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.5628139972686768 + }, + { + "auxiliary_loss_clip": 0.01151427, + "auxiliary_loss_mlp": 0.01117427, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00060225, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.1093719479992616, + "language_loss": 0.82369065, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84637916, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.5428900718688965 + }, + { + "auxiliary_loss_clip": 0.01136002, + "auxiliary_loss_mlp": 0.01117467, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00064206, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.8391058003784095, + "language_loss": 0.73247242, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75500703, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.600583553314209 + }, + { + "auxiliary_loss_clip": 0.01153267, + "auxiliary_loss_mlp": 0.01117361, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00063157, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.6115412337745176, + "language_loss": 0.68034422, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70305049, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 2.653404474258423 + }, + { + "auxiliary_loss_clip": 0.01168114, + "auxiliary_loss_mlp": 0.00747976, + "balance_loss_clip": 1.00196874, + "balance_loss_mlp": 1.00117922, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.5547106785484857, + "language_loss": 0.77766204, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.79682291, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.547945737838745 + }, + { + "auxiliary_loss_clip": 0.01151329, + "auxiliary_loss_mlp": 0.01117147, + "balance_loss_clip": 1.00195026, + "balance_loss_mlp": 1.00051296, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 1.8228806076053907, + "language_loss": 0.79207516, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81475997, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.5849249362945557 + }, + { + "auxiliary_loss_clip": 0.01135395, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00065529, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.0825726695225097, + "language_loss": 0.9084959, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93100369, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.5947399139404297 + }, + { + "auxiliary_loss_clip": 0.01168146, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00064886, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.5297193174427732, + "language_loss": 0.76629061, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78914207, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.522782802581787 + }, + { + "auxiliary_loss_clip": 0.01151574, + "auxiliary_loss_mlp": 0.01116863, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00070536, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.574293637426689, + "language_loss": 0.74069834, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.76338267, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.6935694217681885 + }, + { + "auxiliary_loss_clip": 0.01132296, + "auxiliary_loss_mlp": 0.01097102, + "balance_loss_clip": 1.00133443, + "balance_loss_mlp": 1.00020909, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7748760762167729, + "language_loss": 0.52908075, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.55137479, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.244257926940918 + }, + { + "auxiliary_loss_clip": 0.01151188, + "auxiliary_loss_mlp": 0.01116753, + "balance_loss_clip": 1.00219917, + "balance_loss_mlp": 1.00069082, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.5389221907578265, + "language_loss": 0.75497979, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.77765918, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.574103832244873 + }, + { + "auxiliary_loss_clip": 0.01150848, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_clip": 1.00198328, + "balance_loss_mlp": 1.00064909, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.6374709280368738, + "language_loss": 0.76953232, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79220128, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.5606775283813477 + }, + { + "auxiliary_loss_clip": 0.01138085, + "auxiliary_loss_mlp": 0.01115043, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00069785, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.7234848941719594, + "language_loss": 0.85389602, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87642729, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.607280969619751 + }, + { + "auxiliary_loss_clip": 0.0115317, + "auxiliary_loss_mlp": 0.01117182, + "balance_loss_clip": 1.00192893, + "balance_loss_mlp": 1.00064301, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.471627136495233, + "language_loss": 0.76400334, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78670686, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.5757834911346436 + }, + { + "auxiliary_loss_clip": 0.01103464, + "auxiliary_loss_mlp": 0.00747991, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.0010922, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.632205802405703, + "language_loss": 0.75809956, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77661407, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.655679702758789 + }, + { + "auxiliary_loss_clip": 0.0113577, + "auxiliary_loss_mlp": 0.01117437, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00061202, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 2.0400131587122967, + "language_loss": 0.81120169, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83373374, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.620774984359741 + }, + { + "auxiliary_loss_clip": 0.01168064, + "auxiliary_loss_mlp": 0.01115663, + "balance_loss_clip": 1.00199163, + "balance_loss_mlp": 1.00064957, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.887762947906875, + "language_loss": 0.76103437, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78387165, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 4.08305025100708 + }, + { + "auxiliary_loss_clip": 0.01119809, + "auxiliary_loss_mlp": 0.01116098, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 1.00060773, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.4794105087064917, + "language_loss": 0.79029113, + "learning_rate": 1.994352813122559e-06, + "loss": 0.8126502, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.642456293106079 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01117823, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00071144, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 1.981565964836273, + "language_loss": 0.73072731, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75311422, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.6078550815582275 + }, + { + "auxiliary_loss_clip": 0.01151233, + "auxiliary_loss_mlp": 0.01116418, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.0006423, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 1.9889827509820686, + "language_loss": 0.74726665, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76994318, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 2.4984865188598633 + }, + { + "auxiliary_loss_clip": 0.01119561, + "auxiliary_loss_mlp": 0.01115938, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.0006392, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 2.2922353513787104, + "language_loss": 0.65931839, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68167335, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.639981508255005 + }, + { + "auxiliary_loss_clip": 0.01153408, + "auxiliary_loss_mlp": 0.01117055, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00070679, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4550510144967639, + "language_loss": 0.75844979, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78115439, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.5683228969573975 + }, + { + "auxiliary_loss_clip": 0.0111966, + "auxiliary_loss_mlp": 0.01116716, + "balance_loss_clip": 1.00182498, + "balance_loss_mlp": 1.00074959, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 2.301126722211942, + "language_loss": 0.78568441, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80804819, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 4.007986545562744 + }, + { + "auxiliary_loss_clip": 0.01151865, + "auxiliary_loss_mlp": 0.01115746, + "balance_loss_clip": 1.00208759, + "balance_loss_mlp": 1.00063789, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.122331083502587, + "language_loss": 0.817729, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.84040511, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.5297398567199707 + }, + { + "auxiliary_loss_clip": 0.01134585, + "auxiliary_loss_mlp": 0.01116431, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00075078, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 2.5544556564377627, + "language_loss": 0.71642685, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73893696, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.5876662731170654 + }, + { + "auxiliary_loss_clip": 0.0114937, + "auxiliary_loss_mlp": 0.01096771, + "balance_loss_clip": 1.00146914, + "balance_loss_mlp": 1.00025892, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 1.0562760460710385, + "language_loss": 0.57829034, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.60075176, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 4.655359268188477 + }, + { + "auxiliary_loss_clip": 0.01136502, + "auxiliary_loss_mlp": 0.01117138, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.0007894, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.8690277959280308, + "language_loss": 0.75148559, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77402198, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.572366714477539 + }, + { + "auxiliary_loss_clip": 0.01152606, + "auxiliary_loss_mlp": 0.01116052, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00056183, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.6750297081592649, + "language_loss": 0.67599487, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69868135, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.5556628704071045 + }, + { + "auxiliary_loss_clip": 0.01148022, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_clip": 1.00143957, + "balance_loss_mlp": 1.00010478, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.8391856699510757, + "language_loss": 0.55855125, + "learning_rate": 1.990068767935895e-06, + "loss": 0.58099759, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 4.364866256713867 + }, + { + "auxiliary_loss_clip": 0.01136228, + "auxiliary_loss_mlp": 0.01114901, + "balance_loss_clip": 1.00188434, + "balance_loss_mlp": 1.00074565, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 4.993960213393795, + "language_loss": 0.817541, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.84005231, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.596062421798706 + }, + { + "auxiliary_loss_clip": 0.01153074, + "auxiliary_loss_mlp": 0.01115431, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00051367, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 2.1874404489237675, + "language_loss": 0.83139634, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85408139, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.5597338676452637 + }, + { + "auxiliary_loss_clip": 0.01135989, + "auxiliary_loss_mlp": 0.01116138, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00074387, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.7015964684950038, + "language_loss": 0.69527006, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71779132, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.6910033226013184 + }, + { + "auxiliary_loss_clip": 0.01119886, + "auxiliary_loss_mlp": 0.01115881, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00067687, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.4464332880068789, + "language_loss": 0.77384901, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79620665, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.662935495376587 + }, + { + "auxiliary_loss_clip": 0.01168023, + "auxiliary_loss_mlp": 0.01116235, + "balance_loss_clip": 1.00203264, + "balance_loss_mlp": 1.00074506, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.4259311128124332, + "language_loss": 0.65125144, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67409396, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 2.530031442642212 + }, + { + "auxiliary_loss_clip": 0.01120747, + "auxiliary_loss_mlp": 0.01116628, + "balance_loss_clip": 1.00221562, + "balance_loss_mlp": 1.00056601, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.4943257274562434, + "language_loss": 0.75575852, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77813232, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 2.6458702087402344 + }, + { + "auxiliary_loss_clip": 0.01167924, + "auxiliary_loss_mlp": 0.01116708, + "balance_loss_clip": 1.00195801, + "balance_loss_mlp": 1.00045466, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.5768063419247462, + "language_loss": 0.81006825, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83291453, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.5437376499176025 + }, + { + "auxiliary_loss_clip": 0.011043, + "auxiliary_loss_mlp": 0.01116644, + "balance_loss_clip": 1.00175083, + "balance_loss_mlp": 1.00077248, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.4508766876662134, + "language_loss": 0.75454032, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77674979, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.6980998516082764 + }, + { + "auxiliary_loss_clip": 0.01137658, + "auxiliary_loss_mlp": 0.01115261, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00062978, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.1834518647480228, + "language_loss": 0.72035182, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.742881, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.619264841079712 + }, + { + "auxiliary_loss_clip": 0.01121409, + "auxiliary_loss_mlp": 0.01115504, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.0004909, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 3.1786640685695593, + "language_loss": 0.74564517, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76801431, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.6354258060455322 + }, + { + "auxiliary_loss_clip": 0.01151117, + "auxiliary_loss_mlp": 0.01117174, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00073063, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 2.248149435452107, + "language_loss": 0.83285379, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.8555367, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.565786838531494 + }, + { + "auxiliary_loss_clip": 0.01167952, + "auxiliary_loss_mlp": 0.01116983, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00072992, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.7289494699919676, + "language_loss": 0.74547887, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76832831, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.557715892791748 + }, + { + "auxiliary_loss_clip": 0.01134731, + "auxiliary_loss_mlp": 0.01116216, + "balance_loss_clip": 1.00208068, + "balance_loss_mlp": 1.00072622, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 1.9698950496595875, + "language_loss": 0.72730249, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74981195, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.5671017169952393 + }, + { + "auxiliary_loss_clip": 0.01134491, + "auxiliary_loss_mlp": 0.0111659, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00062346, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.8119409220448353, + "language_loss": 0.8510102, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87352097, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 2.579025983810425 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01115499, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.0004859, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4190115933050023, + "language_loss": 0.64426887, + "learning_rate": 1.984226965411294e-06, + "loss": 0.6669383, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.6181695461273193 + }, + { + "auxiliary_loss_clip": 0.01134733, + "auxiliary_loss_mlp": 0.01115963, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00047326, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.4505654365618517, + "language_loss": 0.7805748, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80308175, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.5584676265716553 + }, + { + "auxiliary_loss_clip": 0.01150742, + "auxiliary_loss_mlp": 0.01115462, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00064003, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 1.8334566740405558, + "language_loss": 0.71992475, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74258685, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.55411696434021 + }, + { + "auxiliary_loss_clip": 0.01153214, + "auxiliary_loss_mlp": 0.01116836, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00067854, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.7665137576755017, + "language_loss": 0.86517841, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88787889, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.5541915893554688 + }, + { + "auxiliary_loss_clip": 0.01153165, + "auxiliary_loss_mlp": 0.01115123, + "balance_loss_clip": 1.00197852, + "balance_loss_mlp": 1.00058699, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.5300066334181792, + "language_loss": 0.73591501, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75859785, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.5781033039093018 + }, + { + "auxiliary_loss_clip": 0.01168038, + "auxiliary_loss_mlp": 0.0111653, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.00046778, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.8439565936369353, + "language_loss": 0.6717698, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69461548, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.4830543994903564 + }, + { + "auxiliary_loss_clip": 0.01167942, + "auxiliary_loss_mlp": 0.01115836, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00053632, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 3.287965281682991, + "language_loss": 0.77458984, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79742765, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.542445659637451 + }, + { + "auxiliary_loss_clip": 0.01152626, + "auxiliary_loss_mlp": 0.0111568, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.00066662, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.850715604724664, + "language_loss": 0.82184362, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84452665, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.5274930000305176 + }, + { + "auxiliary_loss_clip": 0.01168123, + "auxiliary_loss_mlp": 0.01118, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00079322, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.4717821105297495, + "language_loss": 0.66564637, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68850756, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.5419082641601562 + }, + { + "auxiliary_loss_clip": 0.01135537, + "auxiliary_loss_mlp": 0.01116582, + "balance_loss_clip": 1.00200284, + "balance_loss_mlp": 1.00051975, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 2.616703467815367, + "language_loss": 0.8683663, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.8908875, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.5260090827941895 + }, + { + "auxiliary_loss_clip": 0.01151058, + "auxiliary_loss_mlp": 0.01115204, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00076318, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.4436881716983871, + "language_loss": 0.81030917, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83297181, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 3.9340097904205322 + }, + { + "auxiliary_loss_clip": 0.01151345, + "auxiliary_loss_mlp": 0.0074784, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00108135, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 2.3244794184075936, + "language_loss": 0.75094271, + "learning_rate": 1.9799430596079e-06, + "loss": 0.76993454, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.583479166030884 + }, + { + "auxiliary_loss_clip": 0.01168114, + "auxiliary_loss_mlp": 0.01116613, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00074196, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.6164040034922627, + "language_loss": 0.70368165, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72652894, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.4748170375823975 + }, + { + "auxiliary_loss_clip": 0.01147596, + "auxiliary_loss_mlp": 0.01096253, + "balance_loss_clip": 1.00142288, + "balance_loss_mlp": 1.00012231, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.958121216073791, + "language_loss": 0.67256373, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69500226, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.0731282234191895 + }, + { + "auxiliary_loss_clip": 0.01102108, + "auxiliary_loss_mlp": 0.01114882, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00063157, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.1085969530879476, + "language_loss": 0.79150629, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81367624, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.6470539569854736 + }, + { + "auxiliary_loss_clip": 0.01087975, + "auxiliary_loss_mlp": 0.00747819, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00108683, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 3.057624425776229, + "language_loss": 0.82321572, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.8415736, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 4.1279988288879395 + }, + { + "auxiliary_loss_clip": 0.01138304, + "auxiliary_loss_mlp": 0.01115482, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.00075459, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.8877034658450014, + "language_loss": 0.65806162, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68059945, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.597041606903076 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01116451, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.0005796, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.6857188169446087, + "language_loss": 0.606188, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62871516, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.555318593978882 + }, + { + "auxiliary_loss_clip": 0.01167931, + "auxiliary_loss_mlp": 0.01115228, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00059605, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.8523506989457315, + "language_loss": 0.76089525, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78372687, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.5088627338409424 + }, + { + "auxiliary_loss_clip": 0.01121443, + "auxiliary_loss_mlp": 0.01115702, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.0005939, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 1.7352600675162329, + "language_loss": 0.71160722, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73397863, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 4.098147630691528 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01116216, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00072598, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.7911374261970001, + "language_loss": 0.67829609, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70084018, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.5918753147125244 + }, + { + "auxiliary_loss_clip": 0.01152907, + "auxiliary_loss_mlp": 0.01115938, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00054336, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 1.7671676394201838, + "language_loss": 0.70237458, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72506309, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.5560243129730225 + }, + { + "auxiliary_loss_clip": 0.01168201, + "auxiliary_loss_mlp": 0.00748063, + "balance_loss_clip": 1.00220299, + "balance_loss_mlp": 1.00112605, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 2.1030410876482937, + "language_loss": 0.72811413, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.74727678, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 3.9902291297912598 + }, + { + "auxiliary_loss_clip": 0.01134249, + "auxiliary_loss_mlp": 0.01114198, + "balance_loss_clip": 1.00182569, + "balance_loss_mlp": 1.00052035, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.6413768828231918, + "language_loss": 0.77419043, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79667497, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 2.5829219818115234 + }, + { + "auxiliary_loss_clip": 0.01153167, + "auxiliary_loss_mlp": 0.01116116, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00053096, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 1.79325472331657, + "language_loss": 0.74460614, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76729894, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.533085346221924 + }, + { + "auxiliary_loss_clip": 0.01150627, + "auxiliary_loss_mlp": 0.01116567, + "balance_loss_clip": 1.00177133, + "balance_loss_mlp": 1.00060034, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.6420518097602503, + "language_loss": 0.80307484, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82574672, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 2.5444984436035156 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01116228, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00054753, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5048671767657709, + "language_loss": 0.74352956, + "learning_rate": 1.974101522024942e-06, + "loss": 0.7662046, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.601841449737549 + }, + { + "auxiliary_loss_clip": 0.01119556, + "auxiliary_loss_mlp": 0.01115387, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00066042, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.7760454178515879, + "language_loss": 0.78703809, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80938756, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.598743200302124 + }, + { + "auxiliary_loss_clip": 0.0115248, + "auxiliary_loss_mlp": 0.01115542, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00062418, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.5741037769446897, + "language_loss": 0.803702, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82638228, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.580547571182251 + }, + { + "auxiliary_loss_clip": 0.01167954, + "auxiliary_loss_mlp": 0.01115307, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.0008657, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 6.094722678185869, + "language_loss": 0.68576789, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70860052, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.5715224742889404 + }, + { + "auxiliary_loss_clip": 0.01136522, + "auxiliary_loss_mlp": 0.01117004, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00075102, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.496627666004057, + "language_loss": 0.77441955, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79695475, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.58270263671875 + }, + { + "auxiliary_loss_clip": 0.01167946, + "auxiliary_loss_mlp": 0.01116376, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00059974, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.135226719967106, + "language_loss": 0.71661276, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.739456, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.480945348739624 + }, + { + "auxiliary_loss_clip": 0.01117885, + "auxiliary_loss_mlp": 0.01115146, + "balance_loss_clip": 1.00180304, + "balance_loss_mlp": 1.00080025, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 1.7949109188573813, + "language_loss": 0.76039672, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.782727, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.6209592819213867 + }, + { + "auxiliary_loss_clip": 0.01119479, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.00066686, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 1.9777449861895469, + "language_loss": 0.74587286, + "learning_rate": 1.971375543740272e-06, + "loss": 0.7682215, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.623809337615967 + }, + { + "auxiliary_loss_clip": 0.01168052, + "auxiliary_loss_mlp": 0.0111607, + "balance_loss_clip": 1.00215364, + "balance_loss_mlp": 1.00058019, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 2.201651311647691, + "language_loss": 0.77199692, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79483807, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.5521528720855713 + }, + { + "auxiliary_loss_clip": 0.01119674, + "auxiliary_loss_mlp": 0.01115613, + "balance_loss_clip": 1.00209332, + "balance_loss_mlp": 1.00059974, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.625879236405339, + "language_loss": 0.66217852, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68453139, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.5923845767974854 + }, + { + "auxiliary_loss_clip": 0.01167952, + "auxiliary_loss_mlp": 0.01115772, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.00056767, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.6963892314113886, + "language_loss": 0.7637049, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78654218, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.553610324859619 + }, + { + "auxiliary_loss_clip": 0.0116784, + "auxiliary_loss_mlp": 0.01115237, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00060582, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.4667785052765672, + "language_loss": 0.83209443, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85492516, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.5348641872406006 + }, + { + "auxiliary_loss_clip": 0.01168061, + "auxiliary_loss_mlp": 0.01116286, + "balance_loss_clip": 1.00202692, + "balance_loss_mlp": 1.00070095, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.437039970633606, + "language_loss": 0.70140207, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72424549, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.5508697032928467 + }, + { + "auxiliary_loss_clip": 0.01151069, + "auxiliary_loss_mlp": 0.00747956, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.00124311, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 8.496324680729355, + "language_loss": 0.80305564, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82204592, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 2.613368511199951 + }, + { + "auxiliary_loss_clip": 0.01167925, + "auxiliary_loss_mlp": 0.01115578, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00056481, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.7367049419086635, + "language_loss": 0.77906042, + "learning_rate": 1.968649618642264e-06, + "loss": 0.8018955, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.5073647499084473 + }, + { + "auxiliary_loss_clip": 0.01152688, + "auxiliary_loss_mlp": 0.01115774, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.0006659, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 2.167742277841337, + "language_loss": 0.66223335, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68491799, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.524702787399292 + }, + { + "auxiliary_loss_clip": 0.01167948, + "auxiliary_loss_mlp": 0.01117039, + "balance_loss_clip": 1.00190127, + "balance_loss_mlp": 1.00059557, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 1.628388181151762, + "language_loss": 0.71334451, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73619443, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.541470766067505 + }, + { + "auxiliary_loss_clip": 0.0113479, + "auxiliary_loss_mlp": 0.01115284, + "balance_loss_clip": 1.00193131, + "balance_loss_mlp": 1.00055647, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.6700419241846511, + "language_loss": 0.63997394, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66247469, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.621999979019165 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.01116989, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.000736, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.5615772297070547, + "language_loss": 0.70449138, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72702467, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.556647300720215 + }, + { + "auxiliary_loss_clip": 0.01167842, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_clip": 1.00194299, + "balance_loss_mlp": 1.0005703, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 4.426742087163909, + "language_loss": 0.77709526, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79992664, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.4996378421783447 + }, + { + "auxiliary_loss_clip": 0.0108571, + "auxiliary_loss_mlp": 0.01115686, + "balance_loss_clip": 1.00166142, + "balance_loss_mlp": 1.00076878, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 3.89758380952036, + "language_loss": 0.78416204, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80617601, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.6989636421203613 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01116652, + "balance_loss_clip": 1.00187767, + "balance_loss_mlp": 1.00078046, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.0484945193571584, + "language_loss": 0.70515978, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72752345, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.633790969848633 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01116275, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00068963, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.5145163686977992, + "language_loss": 0.78602052, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80836028, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 4.105712175369263 + }, + { + "auxiliary_loss_clip": 0.01153305, + "auxiliary_loss_mlp": 0.01117462, + "balance_loss_clip": 1.00209868, + "balance_loss_mlp": 1.00073266, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 1.7497562033507976, + "language_loss": 0.83883047, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86153817, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.588803768157959 + }, + { + "auxiliary_loss_clip": 0.01152332, + "auxiliary_loss_mlp": 0.01115831, + "balance_loss_clip": 1.00212979, + "balance_loss_mlp": 1.0006268, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.65870862820078, + "language_loss": 0.66240597, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68508756, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 2.5094544887542725 + }, + { + "auxiliary_loss_clip": 0.01121223, + "auxiliary_loss_mlp": 0.01115645, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00063181, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 2.516050189210927, + "language_loss": 0.73265171, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75502038, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.7026655673980713 + }, + { + "auxiliary_loss_clip": 0.01119105, + "auxiliary_loss_mlp": 0.01115985, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00068581, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.882320615104918, + "language_loss": 0.71644306, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73879391, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 2.6153371334075928 + }, + { + "auxiliary_loss_clip": 0.01168045, + "auxiliary_loss_mlp": 0.01116636, + "balance_loss_clip": 1.00204611, + "balance_loss_mlp": 1.00066876, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.6337539520064162, + "language_loss": 0.83082378, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85367054, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 3.89003324508667 + }, + { + "auxiliary_loss_clip": 0.01138557, + "auxiliary_loss_mlp": 0.0111787, + "balance_loss_clip": 1.00209379, + "balance_loss_mlp": 1.00085402, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9835242401656061, + "language_loss": 0.75388116, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77644539, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.5579216480255127 + }, + { + "auxiliary_loss_clip": 0.01168034, + "auxiliary_loss_mlp": 0.01116365, + "balance_loss_clip": 1.0020709, + "balance_loss_mlp": 1.00087547, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.7249719273823878, + "language_loss": 0.77866912, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80151308, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.5592589378356934 + }, + { + "auxiliary_loss_clip": 0.01136002, + "auxiliary_loss_mlp": 0.01116407, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.00063145, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.613046738226276, + "language_loss": 0.70338058, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.7259047, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.6437346935272217 + }, + { + "auxiliary_loss_clip": 0.01152997, + "auxiliary_loss_mlp": 0.01116369, + "balance_loss_clip": 1.00196207, + "balance_loss_mlp": 1.0005933, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.6027407194935095, + "language_loss": 0.69288498, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71557868, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 4.110141038894653 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.00748019, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00128317, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.9006217121482578, + "language_loss": 0.77122974, + "learning_rate": 1.961640376626072e-06, + "loss": 0.79007632, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.6097309589385986 + }, + { + "auxiliary_loss_clip": 0.01134574, + "auxiliary_loss_mlp": 0.01115378, + "balance_loss_clip": 1.00165534, + "balance_loss_mlp": 1.00074589, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 1.9987731632825025, + "language_loss": 0.76371372, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78621328, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.5606560707092285 + }, + { + "auxiliary_loss_clip": 0.01151494, + "auxiliary_loss_mlp": 0.01115506, + "balance_loss_clip": 1.0019964, + "balance_loss_mlp": 1.00068355, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.5240858871251644, + "language_loss": 0.7186389, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74130893, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 3.9770302772521973 + }, + { + "auxiliary_loss_clip": 0.01134611, + "auxiliary_loss_mlp": 0.01117155, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00052035, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.0431423865490475, + "language_loss": 0.68567777, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70819539, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.573293924331665 + }, + { + "auxiliary_loss_clip": 0.01102863, + "auxiliary_loss_mlp": 0.01115194, + "balance_loss_clip": 1.0016681, + "balance_loss_mlp": 1.00056219, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.4747654190640762, + "language_loss": 0.81042397, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83260453, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.702979326248169 + }, + { + "auxiliary_loss_clip": 0.01138053, + "auxiliary_loss_mlp": 0.01116221, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00054014, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.10260140432884, + "language_loss": 0.63679749, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65934026, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.577136516571045 + }, + { + "auxiliary_loss_clip": 0.01134046, + "auxiliary_loss_mlp": 0.00747948, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00124657, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.476277267726427, + "language_loss": 0.66697752, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68579745, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.5827267169952393 + }, + { + "auxiliary_loss_clip": 0.0111689, + "auxiliary_loss_mlp": 0.01115291, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.00065887, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.214568051854589, + "language_loss": 0.76238692, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78470868, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.6325182914733887 + }, + { + "auxiliary_loss_clip": 0.0111913, + "auxiliary_loss_mlp": 0.01116675, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00080347, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 2.0088362440516585, + "language_loss": 0.78637445, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80873251, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.5875022411346436 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01114514, + "balance_loss_clip": 1.00179803, + "balance_loss_mlp": 1.000741, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.7873527086200405, + "language_loss": 0.71972466, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74207878, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.595932960510254 + }, + { + "auxiliary_loss_clip": 0.01152946, + "auxiliary_loss_mlp": 0.01116145, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.00065494, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.4604100779842966, + "language_loss": 0.74639487, + "learning_rate": 1.957746551415166e-06, + "loss": 0.76908576, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.527353525161743 + }, + { + "auxiliary_loss_clip": 0.01136103, + "auxiliary_loss_mlp": 0.01116214, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00072384, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.2066744730056027, + "language_loss": 0.8584159, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88093907, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.5664615631103516 + }, + { + "auxiliary_loss_clip": 0.01132827, + "auxiliary_loss_mlp": 0.01096392, + "balance_loss_clip": 1.00164902, + "balance_loss_mlp": 1.00026131, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8840906453458998, + "language_loss": 0.63108242, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65337455, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.1063618659973145 + }, + { + "auxiliary_loss_clip": 0.01150701, + "auxiliary_loss_mlp": 0.01114963, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00071263, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.613494311554942, + "language_loss": 0.68941057, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71206725, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.584702730178833 + }, + { + "auxiliary_loss_clip": 0.01153137, + "auxiliary_loss_mlp": 0.01115395, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00066757, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.5265669777882842, + "language_loss": 0.64838064, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67106593, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 2.591284990310669 + }, + { + "auxiliary_loss_clip": 0.01138229, + "auxiliary_loss_mlp": 0.01116969, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00062108, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 1.9756075010338707, + "language_loss": 0.68619704, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70874906, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.5836684703826904 + }, + { + "auxiliary_loss_clip": 0.01168045, + "auxiliary_loss_mlp": 0.01116348, + "balance_loss_clip": 1.00213337, + "balance_loss_mlp": 1.00085795, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.7228075718382012, + "language_loss": 0.6706506, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.6934945, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.506898880004883 + }, + { + "auxiliary_loss_clip": 0.01168037, + "auxiliary_loss_mlp": 0.01116412, + "balance_loss_clip": 1.00208199, + "balance_loss_mlp": 1.00063562, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.778895014589639, + "language_loss": 0.83277678, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85562122, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.4751393795013428 + }, + { + "auxiliary_loss_clip": 0.01134121, + "auxiliary_loss_mlp": 0.01114965, + "balance_loss_clip": 1.00179183, + "balance_loss_mlp": 1.00061965, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.6214225664592519, + "language_loss": 0.77218539, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79467618, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.613811731338501 + }, + { + "auxiliary_loss_clip": 0.01121615, + "auxiliary_loss_mlp": 0.01116503, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00082231, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.8316914997160263, + "language_loss": 0.69164562, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71402681, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.7276551723480225 + }, + { + "auxiliary_loss_clip": 0.01119476, + "auxiliary_loss_mlp": 0.0111575, + "balance_loss_clip": 1.00180042, + "balance_loss_mlp": 1.00083256, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.5144302698819143, + "language_loss": 0.76189095, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78424323, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.654700756072998 + }, + { + "auxiliary_loss_clip": 0.01153084, + "auxiliary_loss_mlp": 0.00747827, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00112104, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 3.9374472944192407, + "language_loss": 0.75420445, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77321357, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 2.5487327575683594 + }, + { + "auxiliary_loss_clip": 0.01134394, + "auxiliary_loss_mlp": 0.01115895, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00069118, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.662448198593794, + "language_loss": 0.80820918, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83071214, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.5912418365478516 + }, + { + "auxiliary_loss_clip": 0.01137368, + "auxiliary_loss_mlp": 0.01115296, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.00085533, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 2.045829155330165, + "language_loss": 0.6983856, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72091222, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.646153688430786 + }, + { + "auxiliary_loss_clip": 0.011678, + "auxiliary_loss_mlp": 0.0111483, + "balance_loss_clip": 1.00201571, + "balance_loss_mlp": 1.00067472, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 1.8619047436985743, + "language_loss": 0.83445066, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85727692, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.4905357360839844 + }, + { + "auxiliary_loss_clip": 0.01151156, + "auxiliary_loss_mlp": 0.00747832, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00098181, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.438369673941724, + "language_loss": 0.73363614, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.752626, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.51096773147583 + }, + { + "auxiliary_loss_clip": 0.01137684, + "auxiliary_loss_mlp": 0.01114472, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00069833, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 2.468529706932094, + "language_loss": 0.82854223, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85106379, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.5454137325286865 + }, + { + "auxiliary_loss_clip": 0.01119971, + "auxiliary_loss_mlp": 0.0111582, + "balance_loss_clip": 1.00195622, + "balance_loss_mlp": 1.00061595, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 1.8908542086267235, + "language_loss": 0.78771806, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.810076, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.628699779510498 + }, + { + "auxiliary_loss_clip": 0.01153058, + "auxiliary_loss_mlp": 0.01116808, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00074625, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 1.7752858961578557, + "language_loss": 0.76231873, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78501749, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 4.0171613693237305 + }, + { + "auxiliary_loss_clip": 0.01151031, + "auxiliary_loss_mlp": 0.01115449, + "balance_loss_clip": 1.00196934, + "balance_loss_mlp": 1.00062668, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.858398341283072, + "language_loss": 0.73007423, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75273901, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.5884909629821777 + }, + { + "auxiliary_loss_clip": 0.0116807, + "auxiliary_loss_mlp": 0.01117025, + "balance_loss_clip": 1.00199854, + "balance_loss_mlp": 1.00058174, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.8286599484956634, + "language_loss": 0.81808871, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84093964, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.5092926025390625 + }, + { + "auxiliary_loss_clip": 0.01114555, + "auxiliary_loss_mlp": 0.01096221, + "balance_loss_clip": 1.00141525, + "balance_loss_mlp": 1.00009108, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.76377104572637, + "language_loss": 0.55686867, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57897639, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.2234930992126465 + }, + { + "auxiliary_loss_clip": 0.01085446, + "auxiliary_loss_mlp": 0.01116242, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00075221, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.5729383253729412, + "language_loss": 0.73464292, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75665981, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 2.660726547241211 + }, + { + "auxiliary_loss_clip": 0.01136443, + "auxiliary_loss_mlp": 0.01116314, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00082445, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.509505641671329, + "language_loss": 0.71354413, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73607171, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 4.0186567306518555 + }, + { + "auxiliary_loss_clip": 0.01134652, + "auxiliary_loss_mlp": 0.01115698, + "balance_loss_clip": 1.0018419, + "balance_loss_mlp": 1.00058961, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.6970561183385917, + "language_loss": 0.80923438, + "learning_rate": 1.948402052740906e-06, + "loss": 0.83173788, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.5680625438690186 + }, + { + "auxiliary_loss_clip": 0.01153089, + "auxiliary_loss_mlp": 0.01114708, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00074399, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.563511916249356, + "language_loss": 0.74373513, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76641315, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.5493125915527344 + }, + { + "auxiliary_loss_clip": 0.0115313, + "auxiliary_loss_mlp": 0.00747956, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.00109887, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.4805968690627225, + "language_loss": 0.73198116, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75099206, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 4.025487422943115 + }, + { + "auxiliary_loss_clip": 0.01136613, + "auxiliary_loss_mlp": 0.01117091, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00074244, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.6876856607128201, + "language_loss": 0.66687512, + "learning_rate": 1.947234065463318e-06, + "loss": 0.68941212, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.604832887649536 + }, + { + "auxiliary_loss_clip": 0.01137288, + "auxiliary_loss_mlp": 0.00747914, + "balance_loss_clip": 1.00191891, + "balance_loss_mlp": 1.00109708, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 1.6172081318203515, + "language_loss": 0.66435969, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68321168, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.6445882320404053 + }, + { + "auxiliary_loss_clip": 0.01135657, + "auxiliary_loss_mlp": 0.01115926, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00072169, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 2.096968539288985, + "language_loss": 0.76236486, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78488076, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.570054769515991 + }, + { + "auxiliary_loss_clip": 0.01151301, + "auxiliary_loss_mlp": 0.01117311, + "balance_loss_clip": 1.00199044, + "balance_loss_mlp": 1.000772, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 1.9507339401517052, + "language_loss": 0.76538002, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78806615, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 4.1004416942596436 + }, + { + "auxiliary_loss_clip": 0.01135616, + "auxiliary_loss_mlp": 0.01115939, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.00073528, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.649270059899664, + "language_loss": 0.78086519, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80338073, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.5559628009796143 + }, + { + "auxiliary_loss_clip": 0.01136264, + "auxiliary_loss_mlp": 0.01117306, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.0006721, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.1399179922102145, + "language_loss": 0.70116758, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72370338, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.564565658569336 + }, + { + "auxiliary_loss_clip": 0.01147983, + "auxiliary_loss_mlp": 0.01096205, + "balance_loss_clip": 1.00139022, + "balance_loss_mlp": 1.00007427, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6778910637031127, + "language_loss": 0.52478075, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54722267, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.197387218475342 + }, + { + "auxiliary_loss_clip": 0.01136254, + "auxiliary_loss_mlp": 0.01116301, + "balance_loss_clip": 1.0018822, + "balance_loss_mlp": 1.00090587, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.6405805484546327, + "language_loss": 0.75112438, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.77364993, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.709355354309082 + }, + { + "auxiliary_loss_clip": 0.01134648, + "auxiliary_loss_mlp": 0.01114949, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00050831, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.556544342378983, + "language_loss": 0.77069801, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79319394, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.5813114643096924 + }, + { + "auxiliary_loss_clip": 0.01104774, + "auxiliary_loss_mlp": 0.01117595, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00077033, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 2.002862993770167, + "language_loss": 0.83322334, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85544705, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.681100845336914 + }, + { + "auxiliary_loss_clip": 0.01136027, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_clip": 1.00193131, + "balance_loss_mlp": 1.00057554, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.7665522337059871, + "language_loss": 0.69507706, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71759796, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.603602170944214 + }, + { + "auxiliary_loss_clip": 0.01151329, + "auxiliary_loss_mlp": 0.01115907, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00060821, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.708780676387756, + "language_loss": 0.82986403, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85253644, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.557917356491089 + }, + { + "auxiliary_loss_clip": 0.01168082, + "auxiliary_loss_mlp": 0.01116316, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00073075, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.7754407721688963, + "language_loss": 0.6975736, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72041762, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.5203356742858887 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01117245, + "balance_loss_clip": 1.00167751, + "balance_loss_mlp": 1.00061059, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.5015494597193317, + "language_loss": 0.77081609, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79318136, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.5815305709838867 + }, + { + "auxiliary_loss_clip": 0.01120634, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_clip": 1.00184131, + "balance_loss_mlp": 1.00050986, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.7787027329404488, + "language_loss": 0.75898945, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.7813701, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.577352285385132 + }, + { + "auxiliary_loss_clip": 0.01136244, + "auxiliary_loss_mlp": 0.01116145, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00075042, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.3603559457667747, + "language_loss": 0.71028376, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73280758, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.6538562774658203 + }, + { + "auxiliary_loss_clip": 0.01167978, + "auxiliary_loss_mlp": 0.01116024, + "balance_loss_clip": 1.00203633, + "balance_loss_mlp": 1.00082016, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 3.5017857667679113, + "language_loss": 0.86861789, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89145792, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.5013649463653564 + }, + { + "auxiliary_loss_clip": 0.01151633, + "auxiliary_loss_mlp": 0.01115678, + "balance_loss_clip": 1.0020411, + "balance_loss_mlp": 1.00056911, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 3.1963265963552323, + "language_loss": 0.61285526, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63552839, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.561413288116455 + }, + { + "auxiliary_loss_clip": 0.01136931, + "auxiliary_loss_mlp": 0.01116282, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00079203, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.9944152269383424, + "language_loss": 0.72082442, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74335653, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.6049249172210693 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01115527, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00060952, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.8338125431484766, + "language_loss": 0.73551178, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75817841, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.533217191696167 + }, + { + "auxiliary_loss_clip": 0.01153139, + "auxiliary_loss_mlp": 0.0111638, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00079489, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.4964889160139305, + "language_loss": 0.69971943, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72241461, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.6209118366241455 + }, + { + "auxiliary_loss_clip": 0.01104895, + "auxiliary_loss_mlp": 0.01115919, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00081098, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.5947498373845728, + "language_loss": 0.86401969, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88622785, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.6726341247558594 + }, + { + "auxiliary_loss_clip": 0.01167952, + "auxiliary_loss_mlp": 0.01116252, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00085723, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.5750966606256875, + "language_loss": 0.79959536, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82243741, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.479135751724243 + }, + { + "auxiliary_loss_clip": 0.01151635, + "auxiliary_loss_mlp": 0.01117462, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00082779, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.0620985003843177, + "language_loss": 0.74398392, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.76667494, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 2.5470616817474365 + }, + { + "auxiliary_loss_clip": 0.011681, + "auxiliary_loss_mlp": 0.01116642, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.0005796, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.6740421440504136, + "language_loss": 0.70405251, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72689992, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.572904586791992 + }, + { + "auxiliary_loss_clip": 0.01115845, + "auxiliary_loss_mlp": 0.01096479, + "balance_loss_clip": 1.00151145, + "balance_loss_mlp": 0.99996692, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7544144451580702, + "language_loss": 0.55652356, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57864678, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 3.1817288398742676 + }, + { + "auxiliary_loss_clip": 0.01131585, + "auxiliary_loss_mlp": 0.01095944, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.00019491, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7932539700000365, + "language_loss": 0.58357, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60584533, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.1299965381622314 + }, + { + "auxiliary_loss_clip": 0.01152685, + "auxiliary_loss_mlp": 0.01117008, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.00065994, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3251255207873693, + "language_loss": 0.70511615, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72781307, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.5828680992126465 + }, + { + "auxiliary_loss_clip": 0.01151171, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00062513, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.362648385223075, + "language_loss": 0.69455457, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71722448, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 4.0090718269348145 + }, + { + "auxiliary_loss_clip": 0.01120165, + "auxiliary_loss_mlp": 0.01116538, + "balance_loss_clip": 1.00191128, + "balance_loss_mlp": 1.00066698, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.8667322563028281, + "language_loss": 0.84072274, + "learning_rate": 1.935944509558464e-06, + "loss": 0.8630898, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.675121784210205 + }, + { + "auxiliary_loss_clip": 0.01120988, + "auxiliary_loss_mlp": 0.01115734, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00062513, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.2100873323110393, + "language_loss": 0.79690599, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81927323, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 2.596792221069336 + }, + { + "auxiliary_loss_clip": 0.01151203, + "auxiliary_loss_mlp": 0.01115167, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00063062, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.6035487792713659, + "language_loss": 0.82823336, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85089707, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.573683023452759 + }, + { + "auxiliary_loss_clip": 0.01150537, + "auxiliary_loss_mlp": 0.01115229, + "balance_loss_clip": 1.00188649, + "balance_loss_mlp": 1.00059748, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.4649585800581542, + "language_loss": 0.77532238, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79798007, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.4946188926696777 + }, + { + "auxiliary_loss_clip": 0.01168163, + "auxiliary_loss_mlp": 0.01116844, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.0005914, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 2.499214706244492, + "language_loss": 0.82004237, + "learning_rate": 1.934387481628208e-06, + "loss": 0.84289241, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 2.4620308876037598 + }, + { + "auxiliary_loss_clip": 0.01134656, + "auxiliary_loss_mlp": 0.01115884, + "balance_loss_clip": 1.00179684, + "balance_loss_mlp": 1.00058508, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3365883433306986, + "language_loss": 0.76566136, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78816682, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 4.090827941894531 + }, + { + "auxiliary_loss_clip": 0.01151134, + "auxiliary_loss_mlp": 0.01115931, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00072765, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.4806413206892863, + "language_loss": 0.80216587, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82483649, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.595585823059082 + }, + { + "auxiliary_loss_clip": 0.0116803, + "auxiliary_loss_mlp": 0.01116542, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.00057554, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 1.9627783319855618, + "language_loss": 0.69932485, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72217059, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.588474750518799 + }, + { + "auxiliary_loss_clip": 0.01134667, + "auxiliary_loss_mlp": 0.01116068, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00067341, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4060736317411915, + "language_loss": 0.77303827, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79554558, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 4.0060389041900635 + }, + { + "auxiliary_loss_clip": 0.01131133, + "auxiliary_loss_mlp": 0.00746359, + "balance_loss_clip": 1.00158834, + "balance_loss_mlp": 1.00043643, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7439537648315497, + "language_loss": 0.54511505, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56388998, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.1825740337371826 + }, + { + "auxiliary_loss_clip": 0.01137514, + "auxiliary_loss_mlp": 0.01115497, + "balance_loss_clip": 1.0020448, + "balance_loss_mlp": 1.00077033, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.7494182347523526, + "language_loss": 0.84426779, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86679792, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.7103183269500732 + }, + { + "auxiliary_loss_clip": 0.01153099, + "auxiliary_loss_mlp": 0.00747961, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00101542, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.8480090338376483, + "language_loss": 0.70052761, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71953821, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.54705810546875 + }, + { + "auxiliary_loss_clip": 0.01136238, + "auxiliary_loss_mlp": 0.01116589, + "balance_loss_clip": 1.00192654, + "balance_loss_mlp": 1.00062203, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.7806979394774685, + "language_loss": 0.66005224, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68258047, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 4.074841022491455 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.01117548, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00062728, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.4404369032357276, + "language_loss": 0.63073939, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65311116, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.5828232765197754 + }, + { + "auxiliary_loss_clip": 0.01149364, + "auxiliary_loss_mlp": 0.01096193, + "balance_loss_clip": 1.001459, + "balance_loss_mlp": 1.00006282, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7805288639151832, + "language_loss": 0.54173613, + "learning_rate": 1.930495088031323e-06, + "loss": 0.5641917, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.2188007831573486 + }, + { + "auxiliary_loss_clip": 0.01136454, + "auxiliary_loss_mlp": 0.01117075, + "balance_loss_clip": 1.00201476, + "balance_loss_mlp": 1.00063145, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.6452604499493853, + "language_loss": 0.75584406, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77837932, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.593273878097534 + }, + { + "auxiliary_loss_clip": 0.0115317, + "auxiliary_loss_mlp": 0.01115923, + "balance_loss_clip": 1.00205362, + "balance_loss_mlp": 1.00071907, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.777521137924872, + "language_loss": 0.80926746, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83195841, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.525561571121216 + }, + { + "auxiliary_loss_clip": 0.01152456, + "auxiliary_loss_mlp": 0.01115688, + "balance_loss_clip": 1.00194538, + "balance_loss_mlp": 1.00077009, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 2.0888586408506318, + "language_loss": 0.75466174, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77734315, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.51994252204895 + }, + { + "auxiliary_loss_clip": 0.01088173, + "auxiliary_loss_mlp": 0.01114767, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00051641, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 1.7981017167353925, + "language_loss": 0.83106714, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85309649, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.6956074237823486 + }, + { + "auxiliary_loss_clip": 0.01138001, + "auxiliary_loss_mlp": 0.01116327, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00055051, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9197406016287826, + "language_loss": 0.80673754, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82928085, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.5755250453948975 + }, + { + "auxiliary_loss_clip": 0.01151292, + "auxiliary_loss_mlp": 0.0111672, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.0006578, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.6342427414804763, + "language_loss": 0.72187567, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74455583, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.621417760848999 + }, + { + "auxiliary_loss_clip": 0.01137491, + "auxiliary_loss_mlp": 0.01116359, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00067818, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 2.384563210616349, + "language_loss": 0.7648043, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78734279, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.584291458129883 + }, + { + "auxiliary_loss_clip": 0.01167896, + "auxiliary_loss_mlp": 0.01115769, + "balance_loss_clip": 1.00201344, + "balance_loss_mlp": 1.00066018, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.3012640459749172, + "language_loss": 0.75736713, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78020382, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.524045944213867 + }, + { + "auxiliary_loss_clip": 0.01151328, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00065231, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.6671677678342365, + "language_loss": 0.68446952, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70714521, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.569730520248413 + }, + { + "auxiliary_loss_clip": 0.01152594, + "auxiliary_loss_mlp": 0.01116613, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00074124, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4908661732091382, + "language_loss": 0.83947152, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.8621636, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.536954402923584 + }, + { + "auxiliary_loss_clip": 0.01151131, + "auxiliary_loss_mlp": 0.01116731, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00076389, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.3362234469612617, + "language_loss": 0.8741982, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89687675, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.5718791484832764 + }, + { + "auxiliary_loss_clip": 0.01114081, + "auxiliary_loss_mlp": 0.01095069, + "balance_loss_clip": 1.00145459, + "balance_loss_mlp": 1.00008261, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7437266242130073, + "language_loss": 0.58819377, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.61028528, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.251249313354492 + }, + { + "auxiliary_loss_clip": 0.01121192, + "auxiliary_loss_mlp": 0.01116309, + "balance_loss_clip": 1.00181413, + "balance_loss_mlp": 1.00062823, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 1.783903187498632, + "language_loss": 0.70616269, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72853768, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.6159887313842773 + }, + { + "auxiliary_loss_clip": 0.01150932, + "auxiliary_loss_mlp": 0.01116131, + "balance_loss_clip": 1.00183904, + "balance_loss_mlp": 1.00073647, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.5692538644175393, + "language_loss": 0.87958753, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90225816, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.5044138431549072 + }, + { + "auxiliary_loss_clip": 0.0108763, + "auxiliary_loss_mlp": 0.0111647, + "balance_loss_clip": 1.00177157, + "balance_loss_mlp": 1.00088477, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.4654263668123655, + "language_loss": 0.7613889, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78342992, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.7474119663238525 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00054002, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.8151384288266177, + "language_loss": 0.71514285, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.73764777, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.551730155944824 + }, + { + "auxiliary_loss_clip": 0.01134588, + "auxiliary_loss_mlp": 0.01116736, + "balance_loss_clip": 1.00163138, + "balance_loss_mlp": 1.00067341, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 3.762447102610677, + "language_loss": 0.7570498, + "learning_rate": 1.923878631697736e-06, + "loss": 0.77956295, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.5760161876678467 + }, + { + "auxiliary_loss_clip": 0.0115291, + "auxiliary_loss_mlp": 0.00747777, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00097406, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.7132677621490444, + "language_loss": 0.70719737, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72620422, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.562743663787842 + }, + { + "auxiliary_loss_clip": 0.01132498, + "auxiliary_loss_mlp": 0.01095917, + "balance_loss_clip": 1.00146699, + "balance_loss_mlp": 1.00016844, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9134967091029987, + "language_loss": 0.65440208, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67668623, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 3.05936598777771 + }, + { + "auxiliary_loss_clip": 0.0115119, + "auxiliary_loss_mlp": 0.01116299, + "balance_loss_clip": 1.00191009, + "balance_loss_mlp": 1.00052285, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.6685920721040342, + "language_loss": 0.70904428, + "learning_rate": 1.922711106286265e-06, + "loss": 0.7317192, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.531642198562622 + }, + { + "auxiliary_loss_clip": 0.01123427, + "auxiliary_loss_mlp": 0.01116935, + "balance_loss_clip": 1.00200832, + "balance_loss_mlp": 1.00068164, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.5509394544581423, + "language_loss": 0.74394155, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76634514, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.614082098007202 + }, + { + "auxiliary_loss_clip": 0.01136184, + "auxiliary_loss_mlp": 0.01116654, + "balance_loss_clip": 1.00171435, + "balance_loss_mlp": 1.00059211, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.3897862449680352, + "language_loss": 0.85497713, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87750554, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 2.627216100692749 + }, + { + "auxiliary_loss_clip": 0.01168065, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00063658, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7889014672005983, + "language_loss": 0.7896719, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81251764, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 4.087080478668213 + }, + { + "auxiliary_loss_clip": 0.011523, + "auxiliary_loss_mlp": 0.01115947, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00055289, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.8117476475895766, + "language_loss": 0.73887652, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.76155895, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.5458693504333496 + }, + { + "auxiliary_loss_clip": 0.01137322, + "auxiliary_loss_mlp": 0.01115989, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00078547, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 1.8640226492763636, + "language_loss": 0.73999798, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76253116, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 2.558856248855591 + }, + { + "auxiliary_loss_clip": 0.01119015, + "auxiliary_loss_mlp": 0.01115989, + "balance_loss_clip": 1.00172496, + "balance_loss_mlp": 1.00068998, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.6485102779755307, + "language_loss": 0.73646903, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75881898, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.6219162940979004 + }, + { + "auxiliary_loss_clip": 0.01167867, + "auxiliary_loss_mlp": 0.01115986, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00049591, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.8520273633744957, + "language_loss": 0.68721962, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.71005809, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.543386459350586 + }, + { + "auxiliary_loss_clip": 0.01151126, + "auxiliary_loss_mlp": 0.01116714, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.0007472, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.6417996164617363, + "language_loss": 0.76449001, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78716844, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 3.959007978439331 + }, + { + "auxiliary_loss_clip": 0.0115338, + "auxiliary_loss_mlp": 0.01116722, + "balance_loss_clip": 1.00209939, + "balance_loss_mlp": 1.00094569, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.845216411374046, + "language_loss": 0.65473998, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.677441, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 2.563509941101074 + }, + { + "auxiliary_loss_clip": 0.01119462, + "auxiliary_loss_mlp": 0.01116032, + "balance_loss_clip": 1.00169516, + "balance_loss_mlp": 1.00073278, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.8452689241344824, + "language_loss": 0.85945702, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88181198, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 2.651210308074951 + }, + { + "auxiliary_loss_clip": 0.0113657, + "auxiliary_loss_mlp": 0.01115904, + "balance_loss_clip": 1.00194848, + "balance_loss_mlp": 1.00050914, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.4662642513105262, + "language_loss": 0.79994625, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82247096, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.587946891784668 + }, + { + "auxiliary_loss_clip": 0.01137655, + "auxiliary_loss_mlp": 0.01115386, + "balance_loss_clip": 1.00183666, + "balance_loss_mlp": 1.00075471, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.6024836138688032, + "language_loss": 0.83843875, + "learning_rate": 1.918041272397012e-06, + "loss": 0.86096913, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 4.078732490539551 + }, + { + "auxiliary_loss_clip": 0.01134653, + "auxiliary_loss_mlp": 0.01116282, + "balance_loss_clip": 1.00178063, + "balance_loss_mlp": 1.0006969, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.5903239973249341, + "language_loss": 0.67730248, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.69981182, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.549495220184326 + }, + { + "auxiliary_loss_clip": 0.0113797, + "auxiliary_loss_mlp": 0.01115767, + "balance_loss_clip": 1.0024302, + "balance_loss_mlp": 1.00075364, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.5113022675940264, + "language_loss": 0.825647, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84818435, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.5847623348236084 + }, + { + "auxiliary_loss_clip": 0.01150836, + "auxiliary_loss_mlp": 0.0111721, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00067091, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 1.871837689674753, + "language_loss": 0.79926699, + "learning_rate": 1.916873882856013e-06, + "loss": 0.82194746, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.553380250930786 + }, + { + "auxiliary_loss_clip": 0.01151028, + "auxiliary_loss_mlp": 0.01115095, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00055838, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.3170170630171674, + "language_loss": 0.76386464, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78652585, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 3.9346485137939453 + }, + { + "auxiliary_loss_clip": 0.01118756, + "auxiliary_loss_mlp": 0.01116928, + "balance_loss_clip": 1.00188291, + "balance_loss_mlp": 1.00048399, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.378445854964839, + "language_loss": 0.69509912, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71745598, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.738845109939575 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01115055, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00061393, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.539364403709672, + "language_loss": 0.72135115, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74400979, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 2.548842668533325 + }, + { + "auxiliary_loss_clip": 0.01134013, + "auxiliary_loss_mlp": 0.01115873, + "balance_loss_clip": 1.00164843, + "balance_loss_mlp": 1.00047851, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.7610738269158415, + "language_loss": 0.68679762, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70929646, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.5832157135009766 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01118563, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00068891, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.670592558681416, + "language_loss": 0.68905842, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71177638, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.630701780319214 + }, + { + "auxiliary_loss_clip": 0.01168017, + "auxiliary_loss_mlp": 0.01117509, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00049305, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.2782758854164675, + "language_loss": 0.74994874, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77280396, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.53427791595459 + }, + { + "auxiliary_loss_clip": 0.01151268, + "auxiliary_loss_mlp": 0.01116256, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.00067043, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.69177557514822, + "language_loss": 0.83187568, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85455096, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.530966281890869 + }, + { + "auxiliary_loss_clip": 0.01118657, + "auxiliary_loss_mlp": 0.01114911, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00056565, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.0560283680194362, + "language_loss": 0.82734549, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.84968126, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.624089479446411 + }, + { + "auxiliary_loss_clip": 0.01103175, + "auxiliary_loss_mlp": 0.01115521, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00050759, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.6361028508151323, + "language_loss": 0.83207667, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85426354, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 2.6649739742279053 + }, + { + "auxiliary_loss_clip": 0.01119224, + "auxiliary_loss_mlp": 0.01115888, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00077939, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.6455255475613046, + "language_loss": 0.75057375, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77292496, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.705461025238037 + }, + { + "auxiliary_loss_clip": 0.0115106, + "auxiliary_loss_mlp": 0.011171, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00065613, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.4280251034831493, + "language_loss": 0.70068848, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.72337008, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.6083598136901855 + }, + { + "auxiliary_loss_clip": 0.01167788, + "auxiliary_loss_mlp": 0.01115794, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00049436, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5921487971078452, + "language_loss": 0.78745258, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81028843, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.5056445598602295 + }, + { + "auxiliary_loss_clip": 0.01104371, + "auxiliary_loss_mlp": 0.01115714, + "balance_loss_clip": 1.00238061, + "balance_loss_mlp": 1.00070119, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 3.1471542162517196, + "language_loss": 0.66139996, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68360078, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.638123035430908 + }, + { + "auxiliary_loss_clip": 0.01135891, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00054586, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 1.8707839478721846, + "language_loss": 0.79868054, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82120168, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.6052112579345703 + }, + { + "auxiliary_loss_clip": 0.01167859, + "auxiliary_loss_mlp": 0.01116131, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00064111, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 1.839220511183021, + "language_loss": 0.84579837, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86863828, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.4795258045196533 + }, + { + "auxiliary_loss_clip": 0.01136507, + "auxiliary_loss_mlp": 0.01116866, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00070798, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 1.9580032761815256, + "language_loss": 0.67359591, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69612962, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.5800771713256836 + }, + { + "auxiliary_loss_clip": 0.01134973, + "auxiliary_loss_mlp": 0.01116015, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.00052476, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.897344124118333, + "language_loss": 0.80636692, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82887685, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.569361686706543 + }, + { + "auxiliary_loss_clip": 0.01122813, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00054657, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.9576274123419168, + "language_loss": 0.69065189, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71304226, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 2.6206486225128174 + }, + { + "auxiliary_loss_clip": 0.01151227, + "auxiliary_loss_mlp": 0.01115577, + "balance_loss_clip": 1.00200021, + "balance_loss_mlp": 1.00065935, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.479391194480688, + "language_loss": 0.82283306, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84550107, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.5210320949554443 + }, + { + "auxiliary_loss_clip": 0.01136037, + "auxiliary_loss_mlp": 0.00747765, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00085473, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 1.8191840974776898, + "language_loss": 0.70492685, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.7237649, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.586033821105957 + }, + { + "auxiliary_loss_clip": 0.01151267, + "auxiliary_loss_mlp": 0.01115799, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00059557, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.8726965540443945, + "language_loss": 0.69424379, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71691442, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.502446174621582 + }, + { + "auxiliary_loss_clip": 0.01117451, + "auxiliary_loss_mlp": 0.01095809, + "balance_loss_clip": 1.00145602, + "balance_loss_mlp": 1.00005996, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.9338321581309736, + "language_loss": 0.56934297, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59147555, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.0897064208984375 + }, + { + "auxiliary_loss_clip": 0.01137351, + "auxiliary_loss_mlp": 0.01115504, + "balance_loss_clip": 1.00203717, + "balance_loss_mlp": 1.00068188, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5117236345040106, + "language_loss": 0.6394276, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66195619, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.6359620094299316 + }, + { + "auxiliary_loss_clip": 0.0113604, + "auxiliary_loss_mlp": 0.01115805, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.0004108, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.5588022342396435, + "language_loss": 0.68704408, + "learning_rate": 1.907535821289003e-06, + "loss": 0.70956254, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 2.764850616455078 + }, + { + "auxiliary_loss_clip": 0.01151269, + "auxiliary_loss_mlp": 0.00747901, + "balance_loss_clip": 1.00198877, + "balance_loss_mlp": 1.00096869, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6347728747856136, + "language_loss": 0.75972772, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.77871943, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.5610508918762207 + }, + { + "auxiliary_loss_clip": 0.01149469, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_clip": 1.0014441, + "balance_loss_mlp": 1.00000632, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.7549032043517362, + "language_loss": 0.52951217, + "learning_rate": 1.906757737841291e-06, + "loss": 0.5519644, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 4.75393271446228 + }, + { + "auxiliary_loss_clip": 0.01149336, + "auxiliary_loss_mlp": 0.01096153, + "balance_loss_clip": 1.00142765, + "balance_loss_mlp": 1.00002241, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7409496283166253, + "language_loss": 0.63788247, + "learning_rate": 1.906368701413693e-06, + "loss": 0.66033733, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.1105782985687256 + }, + { + "auxiliary_loss_clip": 0.01151468, + "auxiliary_loss_mlp": 0.01116591, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00071979, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.5133570807932297, + "language_loss": 0.7198199, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74250048, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 2.526048183441162 + }, + { + "auxiliary_loss_clip": 0.01118471, + "auxiliary_loss_mlp": 0.01115386, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00056314, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.1669310376981703, + "language_loss": 0.69338089, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71571946, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.571937322616577 + }, + { + "auxiliary_loss_clip": 0.01150999, + "auxiliary_loss_mlp": 0.01114687, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.00053191, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 2.1993806773726936, + "language_loss": 0.86808163, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.89073849, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.494856119155884 + }, + { + "auxiliary_loss_clip": 0.01151287, + "auxiliary_loss_mlp": 0.01117342, + "balance_loss_clip": 1.00187075, + "balance_loss_mlp": 1.00061238, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.568240281067788, + "language_loss": 0.64298964, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66567588, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 2.7053096294403076 + }, + { + "auxiliary_loss_clip": 0.01167814, + "auxiliary_loss_mlp": 0.01115234, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.0006026, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.5274226917959888, + "language_loss": 0.68045193, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70328236, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 3.995957374572754 + }, + { + "auxiliary_loss_clip": 0.01102986, + "auxiliary_loss_mlp": 0.01095765, + "balance_loss_clip": 1.00132465, + "balance_loss_mlp": 1.00001562, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.673980191835178, + "language_loss": 0.5341621, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55614966, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.436002492904663 + }, + { + "auxiliary_loss_clip": 0.01130481, + "auxiliary_loss_mlp": 0.01095739, + "balance_loss_clip": 1.00129378, + "balance_loss_mlp": 0.99999028, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7246464878142855, + "language_loss": 0.5633108, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58557302, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.447265863418579 + }, + { + "auxiliary_loss_clip": 0.01104964, + "auxiliary_loss_mlp": 0.01114483, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00061476, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 4.294999061797863, + "language_loss": 0.81562996, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83782446, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 4.2242958545684814 + }, + { + "auxiliary_loss_clip": 0.0116801, + "auxiliary_loss_mlp": 0.01116564, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00059724, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.4948220571780297, + "language_loss": 0.85125101, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87409663, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.5503592491149902 + }, + { + "auxiliary_loss_clip": 0.01167929, + "auxiliary_loss_mlp": 0.01115474, + "balance_loss_clip": 1.00212789, + "balance_loss_mlp": 1.00055611, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.5881399371995895, + "language_loss": 0.66145742, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68429148, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.599702835083008 + }, + { + "auxiliary_loss_clip": 0.01136219, + "auxiliary_loss_mlp": 0.01115025, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00058424, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.5945283316944896, + "language_loss": 0.720424, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74293643, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 4.225574970245361 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01116015, + "balance_loss_clip": 1.00182545, + "balance_loss_mlp": 1.00052464, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6806553796529025, + "language_loss": 0.65350741, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67602617, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.6835532188415527 + }, + { + "auxiliary_loss_clip": 0.01105284, + "auxiliary_loss_mlp": 0.01115981, + "balance_loss_clip": 1.00171232, + "balance_loss_mlp": 1.00058651, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 2.679376214250973, + "language_loss": 0.7465359, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.76874852, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.6391963958740234 + }, + { + "auxiliary_loss_clip": 0.01119041, + "auxiliary_loss_mlp": 0.01116509, + "balance_loss_clip": 1.00170028, + "balance_loss_mlp": 1.00054264, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.0887109360196527, + "language_loss": 0.82027864, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.8426342, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 2.5916244983673096 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.0111491, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.0006603, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.4928530996807101, + "language_loss": 0.72213769, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.7446444, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.617185115814209 + }, + { + "auxiliary_loss_clip": 0.01134942, + "auxiliary_loss_mlp": 0.01115802, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00040698, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.373664437100395, + "language_loss": 0.74256837, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76507586, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.6213667392730713 + }, + { + "auxiliary_loss_clip": 0.01120909, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00060475, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6358607542471673, + "language_loss": 0.67532396, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69769979, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.691459894180298 + }, + { + "auxiliary_loss_clip": 0.01167833, + "auxiliary_loss_mlp": 0.01116014, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00061917, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.4792336322485282, + "language_loss": 0.69327855, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71611702, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.5086753368377686 + }, + { + "auxiliary_loss_clip": 0.01134336, + "auxiliary_loss_mlp": 0.00747727, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00076914, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 1.773616236847779, + "language_loss": 0.76229203, + "learning_rate": 1.898977700702689e-06, + "loss": 0.78111261, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.5663819313049316 + }, + { + "auxiliary_loss_clip": 0.01070621, + "auxiliary_loss_mlp": 0.01115032, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00068605, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.8742184890525184, + "language_loss": 0.85617077, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87802732, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.710272789001465 + }, + { + "auxiliary_loss_clip": 0.01167707, + "auxiliary_loss_mlp": 0.01115557, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00073481, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.4871839131219586, + "language_loss": 0.64610374, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66893637, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.489837646484375 + }, + { + "auxiliary_loss_clip": 0.01135657, + "auxiliary_loss_mlp": 0.01116488, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00071216, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.7607400693604494, + "language_loss": 0.59833539, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62085676, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.7701404094696045 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01116236, + "balance_loss_clip": 1.00203311, + "balance_loss_mlp": 1.00045979, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.720223661516364, + "language_loss": 0.81292784, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83561987, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.548051595687866 + }, + { + "auxiliary_loss_clip": 0.0115128, + "auxiliary_loss_mlp": 0.01115189, + "balance_loss_clip": 1.00194752, + "balance_loss_mlp": 1.00046158, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.40057628589685, + "language_loss": 0.78391373, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.8065784, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.555546998977661 + }, + { + "auxiliary_loss_clip": 0.01150976, + "auxiliary_loss_mlp": 0.01114823, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.00038254, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 1.9287319891491372, + "language_loss": 0.81097478, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83363277, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.500631093978882 + }, + { + "auxiliary_loss_clip": 0.01151072, + "auxiliary_loss_mlp": 0.01115789, + "balance_loss_clip": 1.00185692, + "balance_loss_mlp": 1.00049019, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 1.8155362332717702, + "language_loss": 0.73594046, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75860906, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.542179584503174 + }, + { + "auxiliary_loss_clip": 0.01121381, + "auxiliary_loss_mlp": 0.0111725, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00061536, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.8734684481104917, + "language_loss": 0.7543509, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77673721, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.622948408126831 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01114713, + "balance_loss_clip": 1.00163782, + "balance_loss_mlp": 1.00046241, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.727643982759906, + "language_loss": 0.73288184, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75521743, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.661571979522705 + }, + { + "auxiliary_loss_clip": 0.01167991, + "auxiliary_loss_mlp": 0.01117114, + "balance_loss_clip": 1.00202632, + "balance_loss_mlp": 1.00057507, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.7193063701072882, + "language_loss": 0.77330917, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79616022, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.5426974296569824 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01114457, + "balance_loss_clip": 1.00166154, + "balance_loss_mlp": 1.00058866, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.5900046816770448, + "language_loss": 0.72429353, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74678379, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.5934152603149414 + }, + { + "auxiliary_loss_clip": 0.01134612, + "auxiliary_loss_mlp": 0.011153, + "balance_loss_clip": 1.00181723, + "balance_loss_mlp": 1.00047779, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.5622184550033809, + "language_loss": 0.80484873, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82734787, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 2.5728085041046143 + }, + { + "auxiliary_loss_clip": 0.01151245, + "auxiliary_loss_mlp": 0.01115347, + "balance_loss_clip": 1.00185812, + "balance_loss_mlp": 1.00042939, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 2.9872275685305247, + "language_loss": 0.85805583, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88072181, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.549926280975342 + }, + { + "auxiliary_loss_clip": 0.01134331, + "auxiliary_loss_mlp": 0.01115629, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00052023, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.7291936192071005, + "language_loss": 0.72958899, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75208855, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.5840492248535156 + }, + { + "auxiliary_loss_clip": 0.01135581, + "auxiliary_loss_mlp": 0.01115056, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00051951, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.5680404926688563, + "language_loss": 0.76856607, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79107243, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.6148440837860107 + }, + { + "auxiliary_loss_clip": 0.01117216, + "auxiliary_loss_mlp": 0.01115229, + "balance_loss_clip": 1.00169277, + "balance_loss_mlp": 1.00059772, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.0044349693269754, + "language_loss": 0.77472597, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79705048, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.5980329513549805 + }, + { + "auxiliary_loss_clip": 0.01131485, + "auxiliary_loss_mlp": 0.01095433, + "balance_loss_clip": 1.00146532, + "balance_loss_mlp": 1.00006545, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6884393647833795, + "language_loss": 0.56739408, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58966327, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 4.6854448318481445 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01116591, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00062394, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.750541135651384, + "language_loss": 0.7363649, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75887257, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.548112154006958 + }, + { + "auxiliary_loss_clip": 0.01130294, + "auxiliary_loss_mlp": 0.01095319, + "balance_loss_clip": 1.00154722, + "balance_loss_mlp": 0.99995154, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.87823990315947, + "language_loss": 0.61006194, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63231808, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 3.186157464981079 + }, + { + "auxiliary_loss_clip": 0.01149327, + "auxiliary_loss_mlp": 0.01095387, + "balance_loss_clip": 1.00147378, + "balance_loss_mlp": 1.00001931, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.8349338575179943, + "language_loss": 0.62167758, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64412469, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.096282720565796 + }, + { + "auxiliary_loss_clip": 0.0111966, + "auxiliary_loss_mlp": 0.01115494, + "balance_loss_clip": 1.0017364, + "balance_loss_mlp": 1.00048089, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.080202129779031, + "language_loss": 0.75427407, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77662557, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.6082184314727783 + }, + { + "auxiliary_loss_clip": 0.01150951, + "auxiliary_loss_mlp": 0.01115159, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00071836, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.4741895877766946, + "language_loss": 0.75584495, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77850604, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 2.5846173763275146 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01114743, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00049257, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 2.11689799156903, + "language_loss": 0.87809211, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.90059477, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 4.049814224243164 + }, + { + "auxiliary_loss_clip": 0.01121554, + "auxiliary_loss_mlp": 0.01115607, + "balance_loss_clip": 1.00198281, + "balance_loss_mlp": 1.00059426, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 1.7512340382993525, + "language_loss": 0.74211621, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76448786, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.6105730533599854 + }, + { + "auxiliary_loss_clip": 0.0115291, + "auxiliary_loss_mlp": 0.01116137, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00045669, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 1.708168556029126, + "language_loss": 0.79607201, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81876248, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.5699450969696045 + }, + { + "auxiliary_loss_clip": 0.01167729, + "auxiliary_loss_mlp": 0.01114712, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00055766, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.359849929137717, + "language_loss": 0.54851991, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57134426, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 4.013113021850586 + }, + { + "auxiliary_loss_clip": 0.01152918, + "auxiliary_loss_mlp": 0.01115222, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00058985, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.527441708305612, + "language_loss": 0.68794441, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.71062577, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 2.5363495349884033 + }, + { + "auxiliary_loss_clip": 0.0113227, + "auxiliary_loss_mlp": 0.007461, + "balance_loss_clip": 1.00144327, + "balance_loss_mlp": 1.00012493, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8027029990318459, + "language_loss": 0.62784153, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64662522, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.1231627464294434 + }, + { + "auxiliary_loss_clip": 0.01152591, + "auxiliary_loss_mlp": 0.01115439, + "balance_loss_clip": 1.00196385, + "balance_loss_mlp": 1.00061703, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.1737724906618343, + "language_loss": 0.80097365, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.82365394, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.5108466148376465 + }, + { + "auxiliary_loss_clip": 0.01136482, + "auxiliary_loss_mlp": 0.01113645, + "balance_loss_clip": 1.00200999, + "balance_loss_mlp": 1.00053883, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.6317945554814866, + "language_loss": 0.73225176, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75475299, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 4.077371597290039 + }, + { + "auxiliary_loss_clip": 0.01135925, + "auxiliary_loss_mlp": 0.00747713, + "balance_loss_clip": 1.00176728, + "balance_loss_mlp": 1.00088954, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.8390561382986021, + "language_loss": 0.64622176, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66505814, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.6208839416503906 + }, + { + "auxiliary_loss_clip": 0.01135831, + "auxiliary_loss_mlp": 0.01115741, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00063229, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 2.1324215420210084, + "language_loss": 0.7767694, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79928505, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.645406723022461 + }, + { + "auxiliary_loss_clip": 0.01123064, + "auxiliary_loss_mlp": 0.01114713, + "balance_loss_clip": 1.00185406, + "balance_loss_mlp": 1.00055814, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 1.6501067409658514, + "language_loss": 0.71083719, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73321497, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.6505093574523926 + }, + { + "auxiliary_loss_clip": 0.0115289, + "auxiliary_loss_mlp": 0.01115623, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00070536, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 2.1590090991574837, + "language_loss": 0.69615149, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71883655, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.578488349914551 + }, + { + "auxiliary_loss_clip": 0.01150957, + "auxiliary_loss_mlp": 0.01112711, + "balance_loss_clip": 1.00193202, + "balance_loss_mlp": 1.0003686, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.462016298463763, + "language_loss": 0.69888192, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.72151858, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.601227045059204 + }, + { + "auxiliary_loss_clip": 0.01135269, + "auxiliary_loss_mlp": 0.01114232, + "balance_loss_clip": 1.00190413, + "balance_loss_mlp": 1.00045824, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.7253762204257983, + "language_loss": 0.78017545, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80267048, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.598550319671631 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01115062, + "balance_loss_clip": 1.00166011, + "balance_loss_mlp": 1.00071669, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.4491415221528758, + "language_loss": 0.85814106, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.88032407, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.699714183807373 + }, + { + "auxiliary_loss_clip": 0.01135965, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00053251, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 2.2280779043957946, + "language_loss": 0.62071002, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.6432299, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.5560362339019775 + }, + { + "auxiliary_loss_clip": 0.01136271, + "auxiliary_loss_mlp": 0.01114608, + "balance_loss_clip": 1.00205195, + "balance_loss_mlp": 1.00064433, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.8357791530253957, + "language_loss": 0.73869765, + "learning_rate": 1.883811143046377e-06, + "loss": 0.76120645, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.6140646934509277 + }, + { + "auxiliary_loss_clip": 0.0116778, + "auxiliary_loss_mlp": 0.01115182, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00083649, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.5908401160326466, + "language_loss": 0.64176118, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66459078, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.5447964668273926 + }, + { + "auxiliary_loss_clip": 0.01152473, + "auxiliary_loss_mlp": 0.01115122, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00058556, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 2.46212282235822, + "language_loss": 0.77988464, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80256057, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 2.5490336418151855 + }, + { + "auxiliary_loss_clip": 0.01151114, + "auxiliary_loss_mlp": 0.01114589, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00043452, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 1.8449501059104838, + "language_loss": 0.73477912, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75743616, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.525787591934204 + }, + { + "auxiliary_loss_clip": 0.01136263, + "auxiliary_loss_mlp": 0.01115557, + "balance_loss_clip": 1.00182211, + "balance_loss_mlp": 1.00054407, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.5240013537736992, + "language_loss": 0.72161603, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74413419, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.739380359649658 + }, + { + "auxiliary_loss_clip": 0.01121279, + "auxiliary_loss_mlp": 0.01115579, + "balance_loss_clip": 1.00177193, + "balance_loss_mlp": 1.00066149, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.7397862902105248, + "language_loss": 0.77738208, + "learning_rate": 1.881867178843637e-06, + "loss": 0.79975069, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.6397273540496826 + }, + { + "auxiliary_loss_clip": 0.01151116, + "auxiliary_loss_mlp": 0.0111558, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00056732, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 2.20130511510991, + "language_loss": 0.75680923, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.77947623, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.516808271408081 + }, + { + "auxiliary_loss_clip": 0.01134664, + "auxiliary_loss_mlp": 0.01116334, + "balance_loss_clip": 1.00194943, + "balance_loss_mlp": 1.00055778, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 2.2414722704305756, + "language_loss": 0.75588715, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77839714, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.6077871322631836 + }, + { + "auxiliary_loss_clip": 0.01133739, + "auxiliary_loss_mlp": 0.01114554, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00058949, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.9534192460051643, + "language_loss": 0.72072983, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.7432127, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.5527002811431885 + }, + { + "auxiliary_loss_clip": 0.0113513, + "auxiliary_loss_mlp": 0.01115367, + "balance_loss_clip": 1.00216627, + "balance_loss_mlp": 1.00064015, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 2.1630864191088457, + "language_loss": 0.64652032, + "learning_rate": 1.880312088025936e-06, + "loss": 0.6690253, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.566058397293091 + }, + { + "auxiliary_loss_clip": 0.01134027, + "auxiliary_loss_mlp": 0.01114466, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.00069284, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 1.9982180555577687, + "language_loss": 0.80561292, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82809782, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.564995050430298 + }, + { + "auxiliary_loss_clip": 0.01152405, + "auxiliary_loss_mlp": 0.01114978, + "balance_loss_clip": 1.00194085, + "balance_loss_mlp": 1.00053704, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.7250959649051048, + "language_loss": 0.69594121, + "learning_rate": 1.879534569789582e-06, + "loss": 0.718615, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.5547726154327393 + }, + { + "auxiliary_loss_clip": 0.01164032, + "auxiliary_loss_mlp": 0.01095392, + "balance_loss_clip": 1.00144839, + "balance_loss_mlp": 1.0000248, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7258751483556962, + "language_loss": 0.59673941, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61933362, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.203714609146118 + }, + { + "auxiliary_loss_clip": 0.01151386, + "auxiliary_loss_mlp": 0.0111461, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00055027, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.7229637468113355, + "language_loss": 0.74820864, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77086866, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 2.5534613132476807 + }, + { + "auxiliary_loss_clip": 0.0114718, + "auxiliary_loss_mlp": 0.01095438, + "balance_loss_clip": 1.00158906, + "balance_loss_mlp": 1.00007081, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7519739503125257, + "language_loss": 0.57170868, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59413481, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 3.0644476413726807 + }, + { + "auxiliary_loss_clip": 0.0116797, + "auxiliary_loss_mlp": 0.01115633, + "balance_loss_clip": 1.00210023, + "balance_loss_mlp": 1.00061941, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.5336086693114184, + "language_loss": 0.7225064, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74534249, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.557133674621582 + }, + { + "auxiliary_loss_clip": 0.01167977, + "auxiliary_loss_mlp": 0.0111553, + "balance_loss_clip": 1.00219989, + "balance_loss_mlp": 1.00061214, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.179630068466534, + "language_loss": 0.83640248, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85923755, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 3.9621574878692627 + }, + { + "auxiliary_loss_clip": 0.01087772, + "auxiliary_loss_mlp": 0.01114027, + "balance_loss_clip": 1.0017885, + "balance_loss_mlp": 1.00054026, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.4571733245995073, + "language_loss": 0.79176247, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81378043, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 2.7062175273895264 + }, + { + "auxiliary_loss_clip": 0.01130052, + "auxiliary_loss_mlp": 0.0109536, + "balance_loss_clip": 1.00166297, + "balance_loss_mlp": 0.99999219, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7923791293740269, + "language_loss": 0.59261215, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61486626, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 3.062650680541992 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_clip": 1.00145197, + "balance_loss_mlp": 1.00006104, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8639621515876015, + "language_loss": 0.63685727, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65913564, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 2.9385836124420166 + }, + { + "auxiliary_loss_clip": 0.01104551, + "auxiliary_loss_mlp": 0.01115769, + "balance_loss_clip": 1.00176394, + "balance_loss_mlp": 1.00066042, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 1.9972909249744664, + "language_loss": 0.81964791, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84185112, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.7195069789886475 + }, + { + "auxiliary_loss_clip": 0.01138265, + "auxiliary_loss_mlp": 0.01114087, + "balance_loss_clip": 1.00217187, + "balance_loss_mlp": 1.0006001, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.6978295876082043, + "language_loss": 0.72465903, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74718255, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.577744245529175 + }, + { + "auxiliary_loss_clip": 0.01135827, + "auxiliary_loss_mlp": 0.01115771, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00056696, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 4.282421953150746, + "language_loss": 0.78760743, + "learning_rate": 1.87525854926798e-06, + "loss": 0.81012332, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 3.9785945415496826 + }, + { + "auxiliary_loss_clip": 0.01117554, + "auxiliary_loss_mlp": 0.00747815, + "balance_loss_clip": 1.00171077, + "balance_loss_mlp": 1.00092161, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.7990679238022105, + "language_loss": 0.74888098, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76753461, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.722454786300659 + }, + { + "auxiliary_loss_clip": 0.01136163, + "auxiliary_loss_mlp": 0.01114169, + "balance_loss_clip": 1.00188959, + "balance_loss_mlp": 1.00049138, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.114989850818332, + "language_loss": 0.69207525, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71457851, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.543888807296753 + }, + { + "auxiliary_loss_clip": 0.01153233, + "auxiliary_loss_mlp": 0.01115439, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00071192, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 1.779562451417911, + "language_loss": 0.77562535, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79831201, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 4.015882730484009 + }, + { + "auxiliary_loss_clip": 0.01167834, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00072086, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 2.1109104908804195, + "language_loss": 0.69625193, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71908092, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 2.4816174507141113 + }, + { + "auxiliary_loss_clip": 0.01167959, + "auxiliary_loss_mlp": 0.01115928, + "balance_loss_clip": 1.00198913, + "balance_loss_mlp": 1.00072467, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 2.1537166463991086, + "language_loss": 0.76284158, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.78568041, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.522357225418091 + }, + { + "auxiliary_loss_clip": 0.0115262, + "auxiliary_loss_mlp": 0.01113972, + "balance_loss_clip": 1.00201595, + "balance_loss_mlp": 1.0006752, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5293524798697604, + "language_loss": 0.74221873, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76488459, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.5523879528045654 + }, + { + "auxiliary_loss_clip": 0.01137184, + "auxiliary_loss_mlp": 0.01114361, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00049233, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.413035925918613, + "language_loss": 0.87734163, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.8998571, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 4.024052143096924 + }, + { + "auxiliary_loss_clip": 0.01167745, + "auxiliary_loss_mlp": 0.0111408, + "balance_loss_clip": 1.00202143, + "balance_loss_mlp": 1.00059319, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.5271140128516194, + "language_loss": 0.7293269, + "learning_rate": 1.872149074536869e-06, + "loss": 0.75214517, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.5050528049468994 + }, + { + "auxiliary_loss_clip": 0.01152336, + "auxiliary_loss_mlp": 0.01113664, + "balance_loss_clip": 1.0019294, + "balance_loss_mlp": 1.00055766, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 2.936349993376804, + "language_loss": 0.75114274, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77380276, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.551151990890503 + }, + { + "auxiliary_loss_clip": 0.01136762, + "auxiliary_loss_mlp": 0.01114019, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00053144, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.6268135419399092, + "language_loss": 0.77005345, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79256123, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.585618019104004 + }, + { + "auxiliary_loss_clip": 0.01134529, + "auxiliary_loss_mlp": 0.01113369, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00045419, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.8957149653968535, + "language_loss": 0.7852633, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80774236, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.555634021759033 + }, + { + "auxiliary_loss_clip": 0.01151012, + "auxiliary_loss_mlp": 0.01114616, + "balance_loss_clip": 1.00185311, + "balance_loss_mlp": 1.00065219, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 2.2250223159751497, + "language_loss": 0.75784957, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.7805059, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.5306737422943115 + }, + { + "auxiliary_loss_clip": 0.01149594, + "auxiliary_loss_mlp": 0.01094764, + "balance_loss_clip": 1.0015316, + "balance_loss_mlp": 1.00015914, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8440657945630489, + "language_loss": 0.57966995, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.6021136, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.300938129425049 + }, + { + "auxiliary_loss_clip": 0.01134443, + "auxiliary_loss_mlp": 0.01113493, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.0005784, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.5113388876837843, + "language_loss": 0.70119262, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72367203, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.6254942417144775 + }, + { + "auxiliary_loss_clip": 0.01136319, + "auxiliary_loss_mlp": 0.01114505, + "balance_loss_clip": 1.00191057, + "balance_loss_mlp": 1.00054049, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 3.2866795880840662, + "language_loss": 0.71489263, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73740083, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.5803287029266357 + }, + { + "auxiliary_loss_clip": 0.01122522, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00050795, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 1.6797444390292278, + "language_loss": 0.77554107, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79791957, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.6063194274902344 + }, + { + "auxiliary_loss_clip": 0.01136102, + "auxiliary_loss_mlp": 0.0111444, + "balance_loss_clip": 1.00171351, + "balance_loss_mlp": 1.00047588, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.3109735387704158, + "language_loss": 0.69704521, + "learning_rate": 1.868651286721281e-06, + "loss": 0.71955067, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.599151611328125 + }, + { + "auxiliary_loss_clip": 0.01151144, + "auxiliary_loss_mlp": 0.00747629, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00072646, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.490273360093646, + "language_loss": 0.72303784, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74202561, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.5926222801208496 + }, + { + "auxiliary_loss_clip": 0.01134331, + "auxiliary_loss_mlp": 0.0111509, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00064909, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.7187722026484658, + "language_loss": 0.73373854, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.7562328, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.5825657844543457 + }, + { + "auxiliary_loss_clip": 0.01150866, + "auxiliary_loss_mlp": 0.01114005, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00061285, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.5631668156781868, + "language_loss": 0.83339441, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.8560431, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.566483974456787 + }, + { + "auxiliary_loss_clip": 0.01151186, + "auxiliary_loss_mlp": 0.00747792, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00082278, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 9.559364931794343, + "language_loss": 0.73946422, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75845397, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.557147741317749 + }, + { + "auxiliary_loss_clip": 0.01152755, + "auxiliary_loss_mlp": 0.01114336, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.00056267, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.6292675336701508, + "language_loss": 0.76502329, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78769416, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.560267448425293 + }, + { + "auxiliary_loss_clip": 0.01136786, + "auxiliary_loss_mlp": 0.00747631, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00073659, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 1.8707722445716424, + "language_loss": 0.74107546, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.75991964, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.575935125350952 + }, + { + "auxiliary_loss_clip": 0.01121414, + "auxiliary_loss_mlp": 0.01114231, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00074387, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 2.006450369116111, + "language_loss": 0.83792108, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86027753, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.6049768924713135 + }, + { + "auxiliary_loss_clip": 0.01136022, + "auxiliary_loss_mlp": 0.01114228, + "balance_loss_clip": 1.00177014, + "balance_loss_mlp": 1.0004549, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.4609686443476013, + "language_loss": 0.81819522, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84069765, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.6256649494171143 + }, + { + "auxiliary_loss_clip": 0.01119063, + "auxiliary_loss_mlp": 0.01114314, + "balance_loss_clip": 1.00177622, + "balance_loss_mlp": 1.00063634, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.75712317516219, + "language_loss": 0.69055861, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71289235, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.6259546279907227 + }, + { + "auxiliary_loss_clip": 0.01135554, + "auxiliary_loss_mlp": 0.01114812, + "balance_loss_clip": 1.00192392, + "balance_loss_mlp": 1.00075293, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.7405917272652447, + "language_loss": 0.71334589, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73584962, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.5644726753234863 + }, + { + "auxiliary_loss_clip": 0.01119127, + "auxiliary_loss_mlp": 0.01114594, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00082088, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 2.382761873338194, + "language_loss": 0.72507739, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74741459, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.6146750450134277 + }, + { + "auxiliary_loss_clip": 0.01135183, + "auxiliary_loss_mlp": 0.01115895, + "balance_loss_clip": 1.00204909, + "balance_loss_mlp": 1.00059628, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.8048843093925613, + "language_loss": 0.71020877, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.73271954, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.6251566410064697 + }, + { + "auxiliary_loss_clip": 0.01136349, + "auxiliary_loss_mlp": 0.01113979, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00068235, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.8229414191687092, + "language_loss": 0.75661016, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77911353, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.5946571826934814 + }, + { + "auxiliary_loss_clip": 0.01102518, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.00162816, + "balance_loss_mlp": 1.00076985, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.010148440513857, + "language_loss": 0.73050582, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74900711, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 4.236345529556274 + }, + { + "auxiliary_loss_clip": 0.01134312, + "auxiliary_loss_mlp": 0.0111505, + "balance_loss_clip": 1.00191319, + "balance_loss_mlp": 1.00070417, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 1.8513910343724642, + "language_loss": 0.71383357, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73632717, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 2.6086134910583496 + }, + { + "auxiliary_loss_clip": 0.0113455, + "auxiliary_loss_mlp": 0.01114179, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00069141, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 2.064221792600983, + "language_loss": 0.74897015, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77145743, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.595029830932617 + }, + { + "auxiliary_loss_clip": 0.01134609, + "auxiliary_loss_mlp": 0.01115653, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.00073504, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.862068751221493, + "language_loss": 0.71459568, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73709834, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.533690929412842 + }, + { + "auxiliary_loss_clip": 0.01152692, + "auxiliary_loss_mlp": 0.01114393, + "balance_loss_clip": 1.00197911, + "balance_loss_mlp": 1.00061965, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.3194978910837472, + "language_loss": 0.68498576, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70765662, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 2.7270939350128174 + }, + { + "auxiliary_loss_clip": 0.01151351, + "auxiliary_loss_mlp": 0.01115915, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.00071132, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 2.71290682653191, + "language_loss": 0.81544161, + "learning_rate": 1.86126840594594e-06, + "loss": 0.83811426, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.5407416820526123 + }, + { + "auxiliary_loss_clip": 0.01151352, + "auxiliary_loss_mlp": 0.01115318, + "balance_loss_clip": 1.00189364, + "balance_loss_mlp": 1.0005914, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.777797750989399, + "language_loss": 0.76710218, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78976882, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 2.508070468902588 + }, + { + "auxiliary_loss_clip": 0.01136745, + "auxiliary_loss_mlp": 0.01115346, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00071418, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.8635233858997138, + "language_loss": 0.70433849, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72685945, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 4.028327941894531 + }, + { + "auxiliary_loss_clip": 0.01118415, + "auxiliary_loss_mlp": 0.01115944, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00064445, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.8606484039945494, + "language_loss": 0.87145197, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89379561, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.643460512161255 + }, + { + "auxiliary_loss_clip": 0.01167935, + "auxiliary_loss_mlp": 0.01114671, + "balance_loss_clip": 1.00201225, + "balance_loss_mlp": 1.00070691, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.5881550938748512, + "language_loss": 0.77944231, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80226833, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 3.9857990741729736 + }, + { + "auxiliary_loss_clip": 0.01119613, + "auxiliary_loss_mlp": 0.0111381, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.00060844, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.3642281362753699, + "language_loss": 0.66999757, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69233179, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 2.6773674488067627 + }, + { + "auxiliary_loss_clip": 0.01152483, + "auxiliary_loss_mlp": 0.01115478, + "balance_loss_clip": 1.00188303, + "balance_loss_mlp": 1.00056016, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.8683904363577195, + "language_loss": 0.73794138, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.76062107, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 2.528728723526001 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.01113898, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00060177, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.5774036654697696, + "language_loss": 0.62843561, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65093154, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.6561784744262695 + }, + { + "auxiliary_loss_clip": 0.01150978, + "auxiliary_loss_mlp": 0.01114576, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.00070715, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.5098760143243755, + "language_loss": 0.66127348, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68392909, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.597809314727783 + }, + { + "auxiliary_loss_clip": 0.01101984, + "auxiliary_loss_mlp": 0.01114165, + "balance_loss_clip": 1.00176024, + "balance_loss_mlp": 1.00058222, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.9167215715315564, + "language_loss": 0.67447937, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69664085, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 4.279451131820679 + }, + { + "auxiliary_loss_clip": 0.01105909, + "auxiliary_loss_mlp": 0.01115039, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.0006938, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.6890821628817432, + "language_loss": 0.75694454, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77915394, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.6949565410614014 + }, + { + "auxiliary_loss_clip": 0.01122697, + "auxiliary_loss_mlp": 0.01114481, + "balance_loss_clip": 1.00210202, + "balance_loss_mlp": 1.00070786, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.7376410365854014, + "language_loss": 0.66354406, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68591583, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.69040584564209 + }, + { + "auxiliary_loss_clip": 0.01150937, + "auxiliary_loss_mlp": 0.0074773, + "balance_loss_clip": 1.00199509, + "balance_loss_mlp": 1.00080514, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.5351524189937098, + "language_loss": 0.82889378, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84788042, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.5680439472198486 + }, + { + "auxiliary_loss_clip": 0.01119107, + "auxiliary_loss_mlp": 0.01113519, + "balance_loss_clip": 1.00177586, + "balance_loss_mlp": 1.00069952, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.8878234421325404, + "language_loss": 0.79866493, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82099122, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.607051372528076 + }, + { + "auxiliary_loss_clip": 0.01150908, + "auxiliary_loss_mlp": 0.01113954, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00065792, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.6927787597149937, + "language_loss": 0.83658779, + "learning_rate": 1.855829598084659e-06, + "loss": 0.85923642, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.5842955112457275 + }, + { + "auxiliary_loss_clip": 0.01119157, + "auxiliary_loss_mlp": 0.01113717, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00070608, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.380946885820666, + "language_loss": 0.72991812, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.75224686, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.824471950531006 + }, + { + "auxiliary_loss_clip": 0.01135726, + "auxiliary_loss_mlp": 0.01114348, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00047898, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.444991126165569, + "language_loss": 0.81514776, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83764845, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.588408946990967 + }, + { + "auxiliary_loss_clip": 0.01167986, + "auxiliary_loss_mlp": 0.01115843, + "balance_loss_clip": 1.00199008, + "balance_loss_mlp": 1.00073481, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.434126087281717, + "language_loss": 0.81082165, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.83366001, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.4733240604400635 + }, + { + "auxiliary_loss_clip": 0.01115732, + "auxiliary_loss_mlp": 0.01094547, + "balance_loss_clip": 1.00140643, + "balance_loss_mlp": 1.00032353, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7060881311891228, + "language_loss": 0.52470815, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54681098, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.183323621749878 + }, + { + "auxiliary_loss_clip": 0.01120657, + "auxiliary_loss_mlp": 0.01113477, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00056231, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 2.390316746755688, + "language_loss": 0.72318316, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.74552453, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.6249496936798096 + }, + { + "auxiliary_loss_clip": 0.01136147, + "auxiliary_loss_mlp": 0.01113403, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00067878, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.5545107741176143, + "language_loss": 0.79206884, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81456441, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.594895601272583 + }, + { + "auxiliary_loss_clip": 0.0116789, + "auxiliary_loss_mlp": 0.01115827, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00071812, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 2.866041552608033, + "language_loss": 0.70245212, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72528929, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.566560983657837 + }, + { + "auxiliary_loss_clip": 0.0114712, + "auxiliary_loss_mlp": 0.01094692, + "balance_loss_clip": 1.00145042, + "balance_loss_mlp": 1.00046873, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8195573332194527, + "language_loss": 0.59671623, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61913431, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.1069107055664062 + }, + { + "auxiliary_loss_clip": 0.01101253, + "auxiliary_loss_mlp": 0.01115709, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.0006001, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.561266974770649, + "language_loss": 0.78246814, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80463779, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.6722185611724854 + }, + { + "auxiliary_loss_clip": 0.01152765, + "auxiliary_loss_mlp": 0.01114002, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00080085, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.5579086513834093, + "language_loss": 0.68583977, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70850748, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.564171075820923 + }, + { + "auxiliary_loss_clip": 0.01102361, + "auxiliary_loss_mlp": 0.01114277, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.00088489, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.9281541246230327, + "language_loss": 0.76790941, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79007578, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.7063732147216797 + }, + { + "auxiliary_loss_clip": 0.01152816, + "auxiliary_loss_mlp": 0.011146, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00073123, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.7113554908030273, + "language_loss": 0.60003906, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62271321, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.594007968902588 + }, + { + "auxiliary_loss_clip": 0.01121517, + "auxiliary_loss_mlp": 0.01114211, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00072408, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.7105856732766223, + "language_loss": 0.79383582, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.8161931, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.678257703781128 + }, + { + "auxiliary_loss_clip": 0.01120965, + "auxiliary_loss_mlp": 0.01114243, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00085115, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.5439117665982882, + "language_loss": 0.77816153, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80051363, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.685983180999756 + }, + { + "auxiliary_loss_clip": 0.01137, + "auxiliary_loss_mlp": 0.01114031, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00063908, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.6978550625219435, + "language_loss": 0.72792631, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.75043666, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.6494839191436768 + }, + { + "auxiliary_loss_clip": 0.01167727, + "auxiliary_loss_mlp": 0.00747903, + "balance_loss_clip": 1.00194657, + "balance_loss_mlp": 1.00092745, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.7169573917718652, + "language_loss": 0.74683392, + "learning_rate": 1.849615132097085e-06, + "loss": 0.7659902, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.490772247314453 + }, + { + "auxiliary_loss_clip": 0.01136042, + "auxiliary_loss_mlp": 0.01113908, + "balance_loss_clip": 1.00198507, + "balance_loss_mlp": 1.00061107, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.419992890701693, + "language_loss": 0.79614991, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81864941, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.6239466667175293 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01114763, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00051332, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 1.7897854167344065, + "language_loss": 0.80555832, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82792932, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.6127521991729736 + }, + { + "auxiliary_loss_clip": 0.01167763, + "auxiliary_loss_mlp": 0.01113719, + "balance_loss_clip": 1.00206995, + "balance_loss_mlp": 1.00061369, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.198522710952587, + "language_loss": 0.76434565, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78716046, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 3.9306321144104004 + }, + { + "auxiliary_loss_clip": 0.01133935, + "auxiliary_loss_mlp": 0.01113933, + "balance_loss_clip": 1.001791, + "balance_loss_mlp": 1.0006361, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.544743722162049, + "language_loss": 0.78280532, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80528402, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 2.5862860679626465 + }, + { + "auxiliary_loss_clip": 0.01130969, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_clip": 1.00153852, + "balance_loss_mlp": 1.00030231, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8576396091567002, + "language_loss": 0.6341424, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65639734, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 3.0858113765716553 + }, + { + "auxiliary_loss_clip": 0.01100153, + "auxiliary_loss_mlp": 0.0109438, + "balance_loss_clip": 1.00163293, + "balance_loss_mlp": 1.00015724, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7202345108916999, + "language_loss": 0.51581955, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53776491, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.2338900566101074 + }, + { + "auxiliary_loss_clip": 0.01151319, + "auxiliary_loss_mlp": 0.01114898, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00055265, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.4614167374631641, + "language_loss": 0.77464998, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79731214, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 2.5895066261291504 + }, + { + "auxiliary_loss_clip": 0.01102395, + "auxiliary_loss_mlp": 0.01113982, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00058961, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.1529304165937635, + "language_loss": 0.83862072, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.86078453, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 2.647876739501953 + }, + { + "auxiliary_loss_clip": 0.01151089, + "auxiliary_loss_mlp": 0.01114108, + "balance_loss_clip": 1.00204933, + "balance_loss_mlp": 1.00062048, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.418636633172562, + "language_loss": 0.78855336, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.81120527, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.621232748031616 + }, + { + "auxiliary_loss_clip": 0.01119513, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00070047, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.6417040321340195, + "language_loss": 0.84305179, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86538213, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 4.094848394393921 + }, + { + "auxiliary_loss_clip": 0.0112948, + "auxiliary_loss_mlp": 0.01094549, + "balance_loss_clip": 1.00112557, + "balance_loss_mlp": 1.00032568, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7284713919985801, + "language_loss": 0.54157543, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56381571, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 3.0627152919769287 + }, + { + "auxiliary_loss_clip": 0.01130568, + "auxiliary_loss_mlp": 0.01094665, + "balance_loss_clip": 1.00141776, + "balance_loss_mlp": 1.00044239, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8075005701240167, + "language_loss": 0.63266408, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65491641, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 4.631804943084717 + }, + { + "auxiliary_loss_clip": 0.01087971, + "auxiliary_loss_mlp": 0.01114316, + "balance_loss_clip": 1.00159574, + "balance_loss_mlp": 1.00063837, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.3544462562821737, + "language_loss": 0.69823492, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72025776, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.772594451904297 + }, + { + "auxiliary_loss_clip": 0.01136605, + "auxiliary_loss_mlp": 0.00747773, + "balance_loss_clip": 1.00206077, + "balance_loss_mlp": 1.0009501, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.1789113798804713, + "language_loss": 0.81963766, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83848143, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.562220811843872 + }, + { + "auxiliary_loss_clip": 0.01167778, + "auxiliary_loss_mlp": 0.01114654, + "balance_loss_clip": 1.00215018, + "balance_loss_mlp": 1.00059485, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.9678557875579217, + "language_loss": 0.72442752, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74725187, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 2.5197935104370117 + }, + { + "auxiliary_loss_clip": 0.01135436, + "auxiliary_loss_mlp": 0.01113114, + "balance_loss_clip": 1.00173378, + "balance_loss_mlp": 1.00058091, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.555994982433563, + "language_loss": 0.81993973, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.84242523, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.614182949066162 + }, + { + "auxiliary_loss_clip": 0.01119427, + "auxiliary_loss_mlp": 0.01114279, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00069618, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.453250164036686, + "language_loss": 0.74188679, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76422393, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 4.165370941162109 + }, + { + "auxiliary_loss_clip": 0.01121146, + "auxiliary_loss_mlp": 0.0074787, + "balance_loss_clip": 1.0018512, + "balance_loss_mlp": 1.00090659, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.201359104614297, + "language_loss": 0.82248718, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84117734, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.692697763442993 + }, + { + "auxiliary_loss_clip": 0.01133998, + "auxiliary_loss_mlp": 0.01114077, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00058961, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.3160147606124746, + "language_loss": 0.75290728, + "learning_rate": 1.842237354749146e-06, + "loss": 0.775388, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.6793289184570312 + }, + { + "auxiliary_loss_clip": 0.01149034, + "auxiliary_loss_mlp": 0.01094232, + "balance_loss_clip": 1.00144672, + "balance_loss_mlp": 1.00000906, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8766090174393517, + "language_loss": 0.60322952, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62566215, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 3.163465738296509 + }, + { + "auxiliary_loss_clip": 0.01152875, + "auxiliary_loss_mlp": 0.01113735, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00081968, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.344542514178461, + "language_loss": 0.7843554, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80702156, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.586332082748413 + }, + { + "auxiliary_loss_clip": 0.01152121, + "auxiliary_loss_mlp": 0.01115518, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00069594, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.0339182483205587, + "language_loss": 0.74093938, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76361573, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.606382131576538 + }, + { + "auxiliary_loss_clip": 0.01163879, + "auxiliary_loss_mlp": 0.01094431, + "balance_loss_clip": 1.00144386, + "balance_loss_mlp": 1.00020778, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7750129414805194, + "language_loss": 0.51058447, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.5331676, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.080207109451294 + }, + { + "auxiliary_loss_clip": 0.0115294, + "auxiliary_loss_mlp": 0.0111386, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00084996, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.5208318787750168, + "language_loss": 0.72239602, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74506402, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.62715482711792 + }, + { + "auxiliary_loss_clip": 0.01152761, + "auxiliary_loss_mlp": 0.00747722, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00090432, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 1.7873847062721786, + "language_loss": 0.69775015, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71675497, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.564208984375 + }, + { + "auxiliary_loss_clip": 0.01086274, + "auxiliary_loss_mlp": 0.01114571, + "balance_loss_clip": 1.00178838, + "balance_loss_mlp": 1.00060713, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.6039318522653594, + "language_loss": 0.72343719, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74544561, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.677896738052368 + }, + { + "auxiliary_loss_clip": 0.01118339, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00076628, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 1.7224194519641345, + "language_loss": 0.73847568, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76081598, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.586930513381958 + }, + { + "auxiliary_loss_clip": 0.01090828, + "auxiliary_loss_mlp": 0.01114467, + "balance_loss_clip": 1.00188768, + "balance_loss_mlp": 1.00078917, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 1.966509715686132, + "language_loss": 0.77011538, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79216826, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.707934617996216 + }, + { + "auxiliary_loss_clip": 0.01167647, + "auxiliary_loss_mlp": 0.01113744, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00063848, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.9850907908346864, + "language_loss": 0.81992853, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84274244, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.5847039222717285 + }, + { + "auxiliary_loss_clip": 0.01152278, + "auxiliary_loss_mlp": 0.01114473, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00070012, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.6385974558385823, + "language_loss": 0.66598368, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.6886512, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.5852739810943604 + }, + { + "auxiliary_loss_clip": 0.01119938, + "auxiliary_loss_mlp": 0.00747755, + "balance_loss_clip": 1.0017128, + "balance_loss_mlp": 1.00096297, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.387592068970461, + "language_loss": 0.82603419, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84471112, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.651313304901123 + }, + { + "auxiliary_loss_clip": 0.01102778, + "auxiliary_loss_mlp": 0.01113865, + "balance_loss_clip": 1.00171161, + "balance_loss_mlp": 1.00066364, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7951183635840204, + "language_loss": 0.70779347, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72995985, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.683678388595581 + }, + { + "auxiliary_loss_clip": 0.01167838, + "auxiliary_loss_mlp": 0.01114285, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00051141, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.6743412069264887, + "language_loss": 0.79926491, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.8220861, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.539112091064453 + }, + { + "auxiliary_loss_clip": 0.01119479, + "auxiliary_loss_mlp": 0.01112859, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00051653, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.385572371125959, + "language_loss": 0.78582335, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80814672, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.656750202178955 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01113386, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00056672, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.6287639170171198, + "language_loss": 0.77110881, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79376727, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.5663931369781494 + }, + { + "auxiliary_loss_clip": 0.01134432, + "auxiliary_loss_mlp": 0.01113664, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.0005579, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 1.7798170027598166, + "language_loss": 0.71074158, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.7332226, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.583465814590454 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01113919, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00052762, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.3659530206344526, + "language_loss": 0.67537057, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69752902, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.7240798473358154 + }, + { + "auxiliary_loss_clip": 0.0115076, + "auxiliary_loss_mlp": 0.01114403, + "balance_loss_clip": 1.00185239, + "balance_loss_mlp": 1.0008204, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.3969582269656817, + "language_loss": 0.77645862, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79911023, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.5605859756469727 + }, + { + "auxiliary_loss_clip": 0.01150761, + "auxiliary_loss_mlp": 0.01112781, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00043869, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.8334957858760992, + "language_loss": 0.69328374, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71591914, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.52996826171875 + }, + { + "auxiliary_loss_clip": 0.01089154, + "auxiliary_loss_mlp": 0.01112826, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00057817, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.799040975301383, + "language_loss": 0.76264477, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78466457, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 4.1165771484375 + }, + { + "auxiliary_loss_clip": 0.01135868, + "auxiliary_loss_mlp": 0.01114119, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.0005362, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.0804075085333857, + "language_loss": 0.76154828, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78404814, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.559884786605835 + }, + { + "auxiliary_loss_clip": 0.01152569, + "auxiliary_loss_mlp": 0.01113371, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.0005517, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6354826331657448, + "language_loss": 0.70326197, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72592133, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 2.584306001663208 + }, + { + "auxiliary_loss_clip": 0.01152759, + "auxiliary_loss_mlp": 0.01114223, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00044942, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 2.108236568547186, + "language_loss": 0.75304317, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77571303, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.5792582035064697 + }, + { + "auxiliary_loss_clip": 0.0115055, + "auxiliary_loss_mlp": 0.01112733, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00058055, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.7369292817106663, + "language_loss": 0.73737311, + "learning_rate": 1.832533059471282e-06, + "loss": 0.76000601, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 2.5589065551757812 + }, + { + "auxiliary_loss_clip": 0.01104223, + "auxiliary_loss_mlp": 0.01111911, + "balance_loss_clip": 1.00178134, + "balance_loss_mlp": 1.00061679, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.7866356828622458, + "language_loss": 0.73349756, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75565886, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.6860196590423584 + }, + { + "auxiliary_loss_clip": 0.01167635, + "auxiliary_loss_mlp": 0.01113847, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00045478, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 1.8987371443390035, + "language_loss": 0.71843588, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74125063, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 2.50626277923584 + }, + { + "auxiliary_loss_clip": 0.01118615, + "auxiliary_loss_mlp": 0.01113292, + "balance_loss_clip": 1.00176156, + "balance_loss_mlp": 1.00047219, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.4061403136974013, + "language_loss": 0.70441198, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72673106, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 2.8831889629364014 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.0111368, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00047922, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.010648198805368, + "language_loss": 0.80561829, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82811874, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 3.995502471923828 + }, + { + "auxiliary_loss_clip": 0.01106203, + "auxiliary_loss_mlp": 0.01113197, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00056815, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.460105019268539, + "language_loss": 0.73446339, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75665736, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 4.21950101852417 + }, + { + "auxiliary_loss_clip": 0.0112069, + "auxiliary_loss_mlp": 0.01113687, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00058174, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.018625017033022, + "language_loss": 0.84489942, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.86724317, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.5905425548553467 + }, + { + "auxiliary_loss_clip": 0.01103963, + "auxiliary_loss_mlp": 0.01111875, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.00058115, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.6234212377418633, + "language_loss": 0.77482861, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.79698694, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.6741833686828613 + }, + { + "auxiliary_loss_clip": 0.01150238, + "auxiliary_loss_mlp": 0.01112975, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.0005368, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.6073098880498728, + "language_loss": 0.69748974, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.72012186, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.5763750076293945 + }, + { + "auxiliary_loss_clip": 0.01147006, + "auxiliary_loss_mlp": 0.01093475, + "balance_loss_clip": 1.00120521, + "balance_loss_mlp": 1.00001454, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9688091499403406, + "language_loss": 0.59144831, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.6138531, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.2347846031188965 + }, + { + "auxiliary_loss_clip": 0.01167775, + "auxiliary_loss_mlp": 0.0074781, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00092363, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 1.8217283486418345, + "language_loss": 0.77965069, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79880655, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 3.9275660514831543 + }, + { + "auxiliary_loss_clip": 0.01136181, + "auxiliary_loss_mlp": 0.01111969, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.0006752, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.813980187389418, + "language_loss": 0.83048248, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85296392, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.5970139503479004 + }, + { + "auxiliary_loss_clip": 0.01151023, + "auxiliary_loss_mlp": 0.01113408, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00058866, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 1.743295557514784, + "language_loss": 0.66658539, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.68922973, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.594511032104492 + }, + { + "auxiliary_loss_clip": 0.01167702, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00079322, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.566811078626713, + "language_loss": 0.74022132, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76303256, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.506679058074951 + }, + { + "auxiliary_loss_clip": 0.01100988, + "auxiliary_loss_mlp": 0.01114061, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00076413, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.0120489651041233, + "language_loss": 0.8778891, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.90003955, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.647844076156616 + }, + { + "auxiliary_loss_clip": 0.011676, + "auxiliary_loss_mlp": 0.01112897, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00055385, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.8630120954036662, + "language_loss": 0.65192145, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67472643, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 2.593893527984619 + }, + { + "auxiliary_loss_clip": 0.0115088, + "auxiliary_loss_mlp": 0.01112803, + "balance_loss_clip": 1.00191116, + "balance_loss_mlp": 1.00074637, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.551512368708606, + "language_loss": 0.7915411, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81417793, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.596468448638916 + }, + { + "auxiliary_loss_clip": 0.01167676, + "auxiliary_loss_mlp": 0.01112753, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00060058, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.7255284773992146, + "language_loss": 0.74097151, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76377577, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.5204014778137207 + }, + { + "auxiliary_loss_clip": 0.01116714, + "auxiliary_loss_mlp": 0.01113909, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00061202, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 1.900450353233848, + "language_loss": 0.72149163, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.7437979, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.6431450843811035 + }, + { + "auxiliary_loss_clip": 0.01135978, + "auxiliary_loss_mlp": 0.01113188, + "balance_loss_clip": 1.00195611, + "balance_loss_mlp": 1.00065446, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.4283695985077076, + "language_loss": 0.80473763, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82722926, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.5867083072662354 + }, + { + "auxiliary_loss_clip": 0.01151203, + "auxiliary_loss_mlp": 0.01114201, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00071383, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.2010653184717466, + "language_loss": 0.8131755, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83582956, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.52374267578125 + }, + { + "auxiliary_loss_clip": 0.01167415, + "auxiliary_loss_mlp": 0.01112669, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.0006125, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.5787641086585824, + "language_loss": 0.81190771, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83470857, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.5120158195495605 + }, + { + "auxiliary_loss_clip": 0.01167548, + "auxiliary_loss_mlp": 0.01113401, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00067711, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 2.2356958031192247, + "language_loss": 0.77438104, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79719055, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.5057783126831055 + }, + { + "auxiliary_loss_clip": 0.01167577, + "auxiliary_loss_mlp": 0.0111418, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00069261, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.5993280297297807, + "language_loss": 0.66389251, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68671012, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 2.7728676795959473 + }, + { + "auxiliary_loss_clip": 0.01152333, + "auxiliary_loss_mlp": 0.01112819, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00057125, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.9910433088738186, + "language_loss": 0.69736779, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72001928, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.624220609664917 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01112149, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00066495, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.4489667253822325, + "language_loss": 0.80385309, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82615888, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.708726644515991 + }, + { + "auxiliary_loss_clip": 0.01102099, + "auxiliary_loss_mlp": 0.01113207, + "balance_loss_clip": 1.00171268, + "balance_loss_mlp": 1.00076914, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.4498279066703772, + "language_loss": 0.78873557, + "learning_rate": 1.822444805916788e-06, + "loss": 0.81088865, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.6914472579956055 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.00747723, + "balance_loss_clip": 1.00194585, + "balance_loss_mlp": 1.00088763, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 2.332768197863813, + "language_loss": 0.81839585, + "learning_rate": 1.822056885403915e-06, + "loss": 0.8372345, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 2.6327178478240967 + }, + { + "auxiliary_loss_clip": 0.01151109, + "auxiliary_loss_mlp": 0.01112544, + "balance_loss_clip": 1.00187266, + "balance_loss_mlp": 1.00058246, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.5890948053102394, + "language_loss": 0.71456528, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73720181, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.5760910511016846 + }, + { + "auxiliary_loss_clip": 0.01152188, + "auxiliary_loss_mlp": 0.01112804, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00046146, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.6138519773413074, + "language_loss": 0.64984357, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67249346, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.6451377868652344 + }, + { + "auxiliary_loss_clip": 0.01119912, + "auxiliary_loss_mlp": 0.00747709, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00093031, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 2.028280578163184, + "language_loss": 0.73898298, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.7576592, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.6522555351257324 + }, + { + "auxiliary_loss_clip": 0.01136003, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00057077, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 2.6291357657580563, + "language_loss": 0.78550613, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.808002, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.6449787616729736 + }, + { + "auxiliary_loss_clip": 0.01117407, + "auxiliary_loss_mlp": 0.01092739, + "balance_loss_clip": 1.00151134, + "balance_loss_mlp": 1.0000422, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7441819719157698, + "language_loss": 0.56528497, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58738637, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.248469352722168 + }, + { + "auxiliary_loss_clip": 0.01103951, + "auxiliary_loss_mlp": 0.01112863, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 1.00051999, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.064369654662236, + "language_loss": 0.77832139, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80048949, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.683988571166992 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.01113582, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00057173, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.502216713942186, + "language_loss": 0.83137834, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85359037, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 4.1417717933654785 + }, + { + "auxiliary_loss_clip": 0.01167586, + "auxiliary_loss_mlp": 0.01113077, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.00054371, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5239095256670825, + "language_loss": 0.75089586, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.7737025, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.594388008117676 + }, + { + "auxiliary_loss_clip": 0.01152325, + "auxiliary_loss_mlp": 0.01111973, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00067925, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.9354913941878997, + "language_loss": 0.85647023, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87911308, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.619345188140869 + }, + { + "auxiliary_loss_clip": 0.01136035, + "auxiliary_loss_mlp": 0.01113884, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00077796, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.6382324314129617, + "language_loss": 0.73928237, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76178157, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.678492307662964 + }, + { + "auxiliary_loss_clip": 0.01117508, + "auxiliary_loss_mlp": 0.01113142, + "balance_loss_clip": 1.0016551, + "balance_loss_mlp": 1.00070417, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.5710672861919324, + "language_loss": 0.7590608, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.7813673, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.743520498275757 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01112204, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00062418, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.7099352742145486, + "language_loss": 0.84150231, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86379445, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.679752826690674 + }, + { + "auxiliary_loss_clip": 0.0111319, + "auxiliary_loss_mlp": 0.01093556, + "balance_loss_clip": 1.00128531, + "balance_loss_mlp": 1.00009561, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.720147900022754, + "language_loss": 0.55919814, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.58126557, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.181480646133423 + }, + { + "auxiliary_loss_clip": 0.01085624, + "auxiliary_loss_mlp": 0.01113762, + "balance_loss_clip": 1.00160527, + "balance_loss_mlp": 1.00046527, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.644050644914603, + "language_loss": 0.75154018, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77353406, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 4.211387634277344 + }, + { + "auxiliary_loss_clip": 0.01134029, + "auxiliary_loss_mlp": 0.01112883, + "balance_loss_clip": 1.00199258, + "balance_loss_mlp": 1.00063562, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.8451252706681902, + "language_loss": 0.66853762, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.69100678, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 4.15625262260437 + }, + { + "auxiliary_loss_clip": 0.01152122, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00057769, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.7887857025860459, + "language_loss": 0.7776773, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80033249, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.6114861965179443 + }, + { + "auxiliary_loss_clip": 0.01120976, + "auxiliary_loss_mlp": 0.01113325, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00060105, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.7681477742236145, + "language_loss": 0.76260293, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78494596, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 2.720433235168457 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01093715, + "balance_loss_clip": 1.00120032, + "balance_loss_mlp": 1.00025451, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6725028320953209, + "language_loss": 0.52437228, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54661208, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.161146402359009 + }, + { + "auxiliary_loss_clip": 0.01134117, + "auxiliary_loss_mlp": 0.01113113, + "balance_loss_clip": 1.00177741, + "balance_loss_mlp": 1.00067449, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.5968080399764941, + "language_loss": 0.76361465, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78608692, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.676785707473755 + }, + { + "auxiliary_loss_clip": 0.01117361, + "auxiliary_loss_mlp": 0.01113131, + "balance_loss_clip": 1.00168383, + "balance_loss_mlp": 1.00078845, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.4993150149031944, + "language_loss": 0.67360306, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69590795, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.640136957168579 + }, + { + "auxiliary_loss_clip": 0.01120852, + "auxiliary_loss_mlp": 0.01112876, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00062811, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.4757852891275542, + "language_loss": 0.84380686, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86614418, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 4.106127500534058 + }, + { + "auxiliary_loss_clip": 0.01167645, + "auxiliary_loss_mlp": 0.01113097, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.00056326, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.5022989813401957, + "language_loss": 0.61798465, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64079213, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.5599820613861084 + }, + { + "auxiliary_loss_clip": 0.01167617, + "auxiliary_loss_mlp": 0.0111331, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00068104, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.6313639144307646, + "language_loss": 0.70103323, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.7238425, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.5694704055786133 + }, + { + "auxiliary_loss_clip": 0.01167331, + "auxiliary_loss_mlp": 0.01112123, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00054336, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.5623136721006077, + "language_loss": 0.7713728, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.7941674, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.596646547317505 + }, + { + "auxiliary_loss_clip": 0.01135817, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00081325, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 2.0566809149914254, + "language_loss": 0.72873163, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.75122803, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.6020195484161377 + }, + { + "auxiliary_loss_clip": 0.0110744, + "auxiliary_loss_mlp": 0.01113704, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.0007894, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 1.9479427404281229, + "language_loss": 0.93748486, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95969629, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 2.7093992233276367 + }, + { + "auxiliary_loss_clip": 0.01152462, + "auxiliary_loss_mlp": 0.01112264, + "balance_loss_clip": 1.00177693, + "balance_loss_mlp": 1.00058818, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.6605975664638744, + "language_loss": 0.74071717, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76336443, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.6166458129882812 + }, + { + "auxiliary_loss_clip": 0.01150748, + "auxiliary_loss_mlp": 0.01112611, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00065017, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 3.9368023046856715, + "language_loss": 0.6688273, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69146085, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.6568870544433594 + }, + { + "auxiliary_loss_clip": 0.01118821, + "auxiliary_loss_mlp": 0.01111869, + "balance_loss_clip": 1.00171161, + "balance_loss_mlp": 1.0005753, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.6518909925130827, + "language_loss": 0.67549354, + "learning_rate": 1.810810185460011e-06, + "loss": 0.6978004, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.7913296222686768 + }, + { + "auxiliary_loss_clip": 0.01167554, + "auxiliary_loss_mlp": 0.01113503, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.00077868, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.647864324183153, + "language_loss": 0.92912698, + "learning_rate": 1.810422473773436e-06, + "loss": 0.9519375, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.583523750305176 + }, + { + "auxiliary_loss_clip": 0.0113514, + "auxiliary_loss_mlp": 0.01113962, + "balance_loss_clip": 1.0019697, + "balance_loss_mlp": 1.0006659, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 1.8081571177418907, + "language_loss": 0.83298427, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85547531, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.6225264072418213 + }, + { + "auxiliary_loss_clip": 0.01137162, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_clip": 1.00221825, + "balance_loss_mlp": 1.0006659, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.007395409978282, + "language_loss": 0.6859504, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70845783, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 2.640935182571411 + }, + { + "auxiliary_loss_clip": 0.01117024, + "auxiliary_loss_mlp": 0.01093072, + "balance_loss_clip": 1.00173509, + "balance_loss_mlp": 0.9999935, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7394198775709627, + "language_loss": 0.57689965, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59900069, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.2418100833892822 + }, + { + "auxiliary_loss_clip": 0.01117391, + "auxiliary_loss_mlp": 0.01113643, + "balance_loss_clip": 1.00172555, + "balance_loss_mlp": 1.00044143, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.745215776371576, + "language_loss": 0.69165498, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.7139653, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.649724006652832 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01111863, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00056899, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 1.9030358334002, + "language_loss": 0.74702764, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.76965237, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.6527490615844727 + }, + { + "auxiliary_loss_clip": 0.011163, + "auxiliary_loss_mlp": 0.01093238, + "balance_loss_clip": 1.0015316, + "balance_loss_mlp": 1.00015914, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.788608108890177, + "language_loss": 0.62673563, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64883101, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.3118064403533936 + }, + { + "auxiliary_loss_clip": 0.01150647, + "auxiliary_loss_mlp": 0.01112389, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.0006187, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.6213418512317714, + "language_loss": 0.79312086, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81575125, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.01152564, + "auxiliary_loss_mlp": 0.01112822, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00057411, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.724277297427938, + "language_loss": 0.7974326, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.82008648, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.6491477489471436 + }, + { + "auxiliary_loss_clip": 0.0115069, + "auxiliary_loss_mlp": 0.01112177, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00050175, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.5185711530987824, + "language_loss": 0.86861265, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89124131, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.5885426998138428 + }, + { + "auxiliary_loss_clip": 0.01135295, + "auxiliary_loss_mlp": 0.01114895, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.00054955, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.7193614678579252, + "language_loss": 0.82328176, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84578371, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.622539758682251 + }, + { + "auxiliary_loss_clip": 0.01167538, + "auxiliary_loss_mlp": 0.01113886, + "balance_loss_clip": 1.0019635, + "balance_loss_mlp": 1.000494, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.5487732563383174, + "language_loss": 0.63674802, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65956223, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.549179792404175 + }, + { + "auxiliary_loss_clip": 0.01167651, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00077116, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.4865138733725052, + "language_loss": 0.79899669, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82180625, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.6043639183044434 + }, + { + "auxiliary_loss_clip": 0.01117957, + "auxiliary_loss_mlp": 0.01111503, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00059104, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 2.0912985388992618, + "language_loss": 0.78127515, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80356979, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.7171008586883545 + }, + { + "auxiliary_loss_clip": 0.01149976, + "auxiliary_loss_mlp": 0.01113787, + "balance_loss_clip": 1.00220561, + "balance_loss_mlp": 1.00058627, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.706620551843284, + "language_loss": 0.75966549, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.78230309, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.6509101390838623 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.01113267, + "balance_loss_clip": 1.00193608, + "balance_loss_mlp": 1.00073338, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.2093095668729634, + "language_loss": 0.63576776, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65807813, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.838975667953491 + }, + { + "auxiliary_loss_clip": 0.01117833, + "auxiliary_loss_mlp": 0.01112859, + "balance_loss_clip": 1.00226963, + "balance_loss_mlp": 1.00080204, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.5115912524321475, + "language_loss": 0.72024488, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.7425518, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 4.156120777130127 + }, + { + "auxiliary_loss_clip": 0.01167526, + "auxiliary_loss_mlp": 0.01112536, + "balance_loss_clip": 1.00214207, + "balance_loss_mlp": 1.00057507, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.6704402852604963, + "language_loss": 0.73982966, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76263034, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.5477802753448486 + }, + { + "auxiliary_loss_clip": 0.01150814, + "auxiliary_loss_mlp": 0.01113044, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00051081, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.7751739031665297, + "language_loss": 0.61094379, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.63358235, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.5865590572357178 + }, + { + "auxiliary_loss_clip": 0.01163809, + "auxiliary_loss_mlp": 0.01092684, + "balance_loss_clip": 1.00145125, + "balance_loss_mlp": 0.99998707, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.7002241731146935, + "language_loss": 0.57101846, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59358341, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.2159745693206787 + }, + { + "auxiliary_loss_clip": 0.01135529, + "auxiliary_loss_mlp": 0.01113016, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00067306, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.5788977890711244, + "language_loss": 0.70074505, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72323048, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.6400082111358643 + }, + { + "auxiliary_loss_clip": 0.01135242, + "auxiliary_loss_mlp": 0.01111263, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00063646, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.939101656795049, + "language_loss": 0.71657288, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73903793, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 4.055600166320801 + }, + { + "auxiliary_loss_clip": 0.01150674, + "auxiliary_loss_mlp": 0.01112456, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00068593, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.7155930612185304, + "language_loss": 0.68576854, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70839989, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 2.572291374206543 + }, + { + "auxiliary_loss_clip": 0.0115099, + "auxiliary_loss_mlp": 0.01112287, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00061178, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.6091594763612298, + "language_loss": 0.80993885, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83257163, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 4.016146183013916 + }, + { + "auxiliary_loss_clip": 0.0115213, + "auxiliary_loss_mlp": 0.01113172, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00063825, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.6239357323004346, + "language_loss": 0.80152923, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82418227, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 2.628117084503174 + }, + { + "auxiliary_loss_clip": 0.0114947, + "auxiliary_loss_mlp": 0.0111224, + "balance_loss_clip": 1.00221229, + "balance_loss_mlp": 1.00056505, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.8491378094444828, + "language_loss": 0.68014866, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.70276576, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 2.694459915161133 + }, + { + "auxiliary_loss_clip": 0.01152244, + "auxiliary_loss_mlp": 0.01113533, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00061822, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.7062298350025484, + "language_loss": 0.81057262, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83323038, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.6930055618286133 + }, + { + "auxiliary_loss_clip": 0.01167631, + "auxiliary_loss_mlp": 0.01113685, + "balance_loss_clip": 1.00196648, + "balance_loss_mlp": 1.00067449, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.5167340218687992, + "language_loss": 0.7559188, + "learning_rate": 1.799957023759277e-06, + "loss": 0.778732, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.6184439659118652 + }, + { + "auxiliary_loss_clip": 0.01119292, + "auxiliary_loss_mlp": 0.01112661, + "balance_loss_clip": 1.0016681, + "balance_loss_mlp": 1.00060391, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.1522663209957122, + "language_loss": 0.83259726, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85491675, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 4.224048614501953 + }, + { + "auxiliary_loss_clip": 0.01167719, + "auxiliary_loss_mlp": 0.01113961, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00056887, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.7133285181114726, + "language_loss": 0.70261925, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72543603, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.560899019241333 + }, + { + "auxiliary_loss_clip": 0.01167272, + "auxiliary_loss_mlp": 0.01112471, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00050974, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.489510677474821, + "language_loss": 0.66040528, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68320274, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.6335973739624023 + }, + { + "auxiliary_loss_clip": 0.01133949, + "auxiliary_loss_mlp": 0.01112187, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00060666, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.6077879175303276, + "language_loss": 0.78843695, + "learning_rate": 1.798407050044766e-06, + "loss": 0.8108983, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.6600608825683594 + }, + { + "auxiliary_loss_clip": 0.01150744, + "auxiliary_loss_mlp": 0.01113344, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00071502, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.63558132085192, + "language_loss": 0.75306278, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77570367, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.5705668926239014 + }, + { + "auxiliary_loss_clip": 0.01136032, + "auxiliary_loss_mlp": 0.01112946, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00069845, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.359937713758477, + "language_loss": 0.75220656, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.77469635, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.643627882003784 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01111782, + "balance_loss_clip": 1.00168848, + "balance_loss_mlp": 1.00048804, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.428451636030071, + "language_loss": 0.76603544, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.78866303, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.6119439601898193 + }, + { + "auxiliary_loss_clip": 0.01150728, + "auxiliary_loss_mlp": 0.01113205, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.0005765, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.7078020769439233, + "language_loss": 0.77805579, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.80069512, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 2.552215576171875 + }, + { + "auxiliary_loss_clip": 0.010682, + "auxiliary_loss_mlp": 0.01092442, + "balance_loss_clip": 1.0014174, + "balance_loss_mlp": 1.00012612, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7261179039384994, + "language_loss": 0.57686162, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59846807, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.4752821922302246 + }, + { + "auxiliary_loss_clip": 0.01120231, + "auxiliary_loss_mlp": 0.01113156, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00052738, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.6372821476483737, + "language_loss": 0.76753038, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78986424, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 3.0948896408081055 + }, + { + "auxiliary_loss_clip": 0.01152954, + "auxiliary_loss_mlp": 0.01113966, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00076509, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 1.6772020084708799, + "language_loss": 0.73682714, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75949633, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.6181230545043945 + }, + { + "auxiliary_loss_clip": 0.01135922, + "auxiliary_loss_mlp": 0.01114054, + "balance_loss_clip": 1.00191677, + "balance_loss_mlp": 1.00066233, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.7018743002558339, + "language_loss": 0.7806282, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.803128, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.6769254207611084 + }, + { + "auxiliary_loss_clip": 0.01167553, + "auxiliary_loss_mlp": 0.01113614, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00050831, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.1618798405054984, + "language_loss": 0.74686742, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76967907, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.5747907161712646 + }, + { + "auxiliary_loss_clip": 0.01152041, + "auxiliary_loss_mlp": 0.01113622, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00051594, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 1.7300873739130183, + "language_loss": 0.68594122, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.70859778, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.604017734527588 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_clip": 1.00183213, + "balance_loss_mlp": 1.00063682, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.0604490171266012, + "language_loss": 0.68415737, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70662034, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.6551315784454346 + }, + { + "auxiliary_loss_clip": 0.01118694, + "auxiliary_loss_mlp": 0.01112904, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00075173, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.5779867306705275, + "language_loss": 0.66458762, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6869036, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.7415566444396973 + }, + { + "auxiliary_loss_clip": 0.01117936, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_clip": 1.00154555, + "balance_loss_mlp": 1.00026131, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7473015425034043, + "language_loss": 0.57559276, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59770173, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.337308883666992 + }, + { + "auxiliary_loss_clip": 0.0114768, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_clip": 1.00170767, + "balance_loss_mlp": 1.00005269, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9048157612470453, + "language_loss": 0.64756286, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66996336, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.080606460571289 + }, + { + "auxiliary_loss_clip": 0.01152604, + "auxiliary_loss_mlp": 0.0111358, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00066543, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.455684130327365, + "language_loss": 0.73009062, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75275242, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.620634078979492 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01112415, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00064433, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.712127935859173, + "language_loss": 0.72593969, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74840283, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.678621768951416 + }, + { + "auxiliary_loss_clip": 0.01150672, + "auxiliary_loss_mlp": 0.00747769, + "balance_loss_clip": 1.00194919, + "balance_loss_mlp": 1.00105226, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.9412438345394731, + "language_loss": 0.67570841, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69469279, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.762294292449951 + }, + { + "auxiliary_loss_clip": 0.01167488, + "auxiliary_loss_mlp": 0.01112317, + "balance_loss_clip": 1.00199473, + "balance_loss_mlp": 1.0006417, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.7694706768764599, + "language_loss": 0.7774061, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80020416, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 2.581918954849243 + }, + { + "auxiliary_loss_clip": 0.01120465, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00067127, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.3854383558187988, + "language_loss": 0.72448468, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74681664, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.7024805545806885 + }, + { + "auxiliary_loss_clip": 0.01134568, + "auxiliary_loss_mlp": 0.01112374, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00050783, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3032762301726304, + "language_loss": 0.65289056, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67536002, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 2.952423095703125 + }, + { + "auxiliary_loss_clip": 0.0115097, + "auxiliary_loss_mlp": 0.01113269, + "balance_loss_clip": 1.00198138, + "balance_loss_mlp": 1.00054526, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.6614189154649706, + "language_loss": 0.81372994, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83637226, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 2.5743184089660645 + }, + { + "auxiliary_loss_clip": 0.01167316, + "auxiliary_loss_mlp": 0.011123, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00052953, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.439513789290818, + "language_loss": 0.80613708, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.8289333, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 4.31778359413147 + }, + { + "auxiliary_loss_clip": 0.01151672, + "auxiliary_loss_mlp": 0.01111494, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00058174, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.742299920577431, + "language_loss": 0.69746178, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.72009349, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.572113275527954 + }, + { + "auxiliary_loss_clip": 0.01152139, + "auxiliary_loss_mlp": 0.01113149, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00042439, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.7135822301687007, + "language_loss": 0.6331349, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65578771, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.5666611194610596 + }, + { + "auxiliary_loss_clip": 0.01167411, + "auxiliary_loss_mlp": 0.01111806, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00041699, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6056345527617932, + "language_loss": 0.75268936, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77548152, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.502558946609497 + }, + { + "auxiliary_loss_clip": 0.01133354, + "auxiliary_loss_mlp": 0.01112132, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.0006479, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.8701205525060385, + "language_loss": 0.774499, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.7969538, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.5638554096221924 + }, + { + "auxiliary_loss_clip": 0.01150337, + "auxiliary_loss_mlp": 0.01111932, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00063872, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.7940583564027577, + "language_loss": 0.7128638, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73548651, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.587717056274414 + }, + { + "auxiliary_loss_clip": 0.0115056, + "auxiliary_loss_mlp": 0.01111856, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00056219, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 2.45129013696288, + "language_loss": 0.71000975, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73263395, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 3.999392509460449 + }, + { + "auxiliary_loss_clip": 0.01100134, + "auxiliary_loss_mlp": 0.01113284, + "balance_loss_clip": 1.00191963, + "balance_loss_mlp": 1.00065494, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 1.8597795204531624, + "language_loss": 0.877262, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.89939618, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 2.6311838626861572 + }, + { + "auxiliary_loss_clip": 0.01087534, + "auxiliary_loss_mlp": 0.0111168, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00048196, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 2.197587675201775, + "language_loss": 0.73228782, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75427997, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 4.128835678100586 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.00747727, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00102866, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.4593849854752745, + "language_loss": 0.72137052, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74020886, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.651106834411621 + }, + { + "auxiliary_loss_clip": 0.01118798, + "auxiliary_loss_mlp": 0.00747909, + "balance_loss_clip": 1.00170279, + "balance_loss_mlp": 1.00113559, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.663443750417801, + "language_loss": 0.72216046, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74082756, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 2.6430764198303223 + }, + { + "auxiliary_loss_clip": 0.01136007, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00075126, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 1.8515430700375597, + "language_loss": 0.76142627, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78390491, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.6141693592071533 + }, + { + "auxiliary_loss_clip": 0.01121425, + "auxiliary_loss_mlp": 0.01111904, + "balance_loss_clip": 1.00170898, + "balance_loss_mlp": 1.00060987, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.5766563389344506, + "language_loss": 0.6248185, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64715177, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 2.7361886501312256 + }, + { + "auxiliary_loss_clip": 0.01167625, + "auxiliary_loss_mlp": 0.01112951, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00060844, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.9452577723334568, + "language_loss": 0.79307628, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81588209, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 3.9117774963378906 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.00747728, + "balance_loss_clip": 1.00198829, + "balance_loss_mlp": 1.00109315, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.662408121300061, + "language_loss": 0.82471389, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84353054, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.6523094177246094 + }, + { + "auxiliary_loss_clip": 0.01117293, + "auxiliary_loss_mlp": 0.01112938, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00069094, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.7910054842856002, + "language_loss": 0.80466044, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82696283, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.626863479614258 + }, + { + "auxiliary_loss_clip": 0.01103345, + "auxiliary_loss_mlp": 0.01112423, + "balance_loss_clip": 1.00175524, + "balance_loss_mlp": 1.00065219, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 2.05333606173609, + "language_loss": 0.61038768, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63254535, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.7058329582214355 + }, + { + "auxiliary_loss_clip": 0.01135306, + "auxiliary_loss_mlp": 0.01111921, + "balance_loss_clip": 1.00193727, + "balance_loss_mlp": 1.00062776, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 2.045747812395518, + "language_loss": 0.71568435, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73815662, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.632676362991333 + }, + { + "auxiliary_loss_clip": 0.0116739, + "auxiliary_loss_mlp": 0.01112352, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00048625, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 2.0456485058245706, + "language_loss": 0.83385003, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85664749, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.498694658279419 + }, + { + "auxiliary_loss_clip": 0.01134376, + "auxiliary_loss_mlp": 0.01111558, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00055051, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.5302049839033702, + "language_loss": 0.80480981, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82726914, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.6405439376831055 + }, + { + "auxiliary_loss_clip": 0.01150907, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_clip": 1.00211668, + "balance_loss_mlp": 1.00057411, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 1.9814327967625256, + "language_loss": 0.74957407, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77220368, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.5241849422454834 + }, + { + "auxiliary_loss_clip": 0.01152577, + "auxiliary_loss_mlp": 0.01113303, + "balance_loss_clip": 1.0019325, + "balance_loss_mlp": 1.00057888, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 3.2994839184526095, + "language_loss": 0.66603148, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.6886903, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.515450954437256 + }, + { + "auxiliary_loss_clip": 0.01119344, + "auxiliary_loss_mlp": 0.01113114, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.0006758, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.678938965769744, + "language_loss": 0.83116353, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85348815, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 2.6000564098358154 + }, + { + "auxiliary_loss_clip": 0.01121069, + "auxiliary_loss_mlp": 0.01112005, + "balance_loss_clip": 1.00201786, + "balance_loss_mlp": 1.00061536, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.6822531615448038, + "language_loss": 0.74156713, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.7638979, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.62526273727417 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01113841, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00063992, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.6286977752460854, + "language_loss": 0.634799, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65715396, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.6061770915985107 + }, + { + "auxiliary_loss_clip": 0.01167506, + "auxiliary_loss_mlp": 0.00747767, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00112581, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.6428858991454556, + "language_loss": 0.62697256, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.64612532, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.5504748821258545 + }, + { + "auxiliary_loss_clip": 0.01152739, + "auxiliary_loss_mlp": 0.01113545, + "balance_loss_clip": 1.00196683, + "balance_loss_mlp": 1.00053501, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.6900357072163796, + "language_loss": 0.74731112, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76997399, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.511918306350708 + }, + { + "auxiliary_loss_clip": 0.01152422, + "auxiliary_loss_mlp": 0.01112451, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00048935, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.4244709546252714, + "language_loss": 0.80849564, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83114439, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.5988667011260986 + }, + { + "auxiliary_loss_clip": 0.01133862, + "auxiliary_loss_mlp": 0.00747741, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00102448, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.905386561513853, + "language_loss": 0.70172954, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72054559, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.6095683574676514 + }, + { + "auxiliary_loss_clip": 0.01118357, + "auxiliary_loss_mlp": 0.01112171, + "balance_loss_clip": 1.00184381, + "balance_loss_mlp": 1.00059152, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 3.6869423633132494, + "language_loss": 0.61353165, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63583696, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.8743512630462646 + }, + { + "auxiliary_loss_clip": 0.01150642, + "auxiliary_loss_mlp": 0.01113014, + "balance_loss_clip": 1.00194347, + "balance_loss_mlp": 1.0005753, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.9293967295064078, + "language_loss": 0.72015011, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74278671, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.6017937660217285 + }, + { + "auxiliary_loss_clip": 0.01085179, + "auxiliary_loss_mlp": 0.01112757, + "balance_loss_clip": 1.00162852, + "balance_loss_mlp": 1.00060451, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 2.28725771582869, + "language_loss": 0.67798185, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.69996125, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.718395709991455 + }, + { + "auxiliary_loss_clip": 0.01148227, + "auxiliary_loss_mlp": 0.0109199, + "balance_loss_clip": 1.00152445, + "balance_loss_mlp": 1.00005555, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 1.0820672918518242, + "language_loss": 0.65278184, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67518401, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.164686918258667 + }, + { + "auxiliary_loss_clip": 0.01150613, + "auxiliary_loss_mlp": 0.01112369, + "balance_loss_clip": 1.00193894, + "balance_loss_mlp": 1.00040758, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.646022290815538, + "language_loss": 0.7503348, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7729646, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.5444438457489014 + }, + { + "auxiliary_loss_clip": 0.0115053, + "auxiliary_loss_mlp": 0.01111522, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00041914, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.677284465671968, + "language_loss": 0.71459591, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73721647, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.537693500518799 + }, + { + "auxiliary_loss_clip": 0.01136097, + "auxiliary_loss_mlp": 0.01112066, + "balance_loss_clip": 1.00165379, + "balance_loss_mlp": 1.0005815, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.7098817995471778, + "language_loss": 0.76462221, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78710383, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.63277268409729 + }, + { + "auxiliary_loss_clip": 0.01116469, + "auxiliary_loss_mlp": 0.01111519, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00060678, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 2.7220859452860773, + "language_loss": 0.7501266, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77240646, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.6324734687805176 + }, + { + "auxiliary_loss_clip": 0.01133954, + "auxiliary_loss_mlp": 0.01113213, + "balance_loss_clip": 1.00206304, + "balance_loss_mlp": 1.00077462, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 4.350270306673711, + "language_loss": 0.77266526, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.79513693, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 4.007476806640625 + }, + { + "auxiliary_loss_clip": 0.01135533, + "auxiliary_loss_mlp": 0.01112537, + "balance_loss_clip": 1.00191724, + "balance_loss_mlp": 1.00057554, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 2.930063114391599, + "language_loss": 0.79671967, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81920034, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.5712430477142334 + }, + { + "auxiliary_loss_clip": 0.01150756, + "auxiliary_loss_mlp": 0.01111876, + "balance_loss_clip": 1.00193298, + "balance_loss_mlp": 1.00058198, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.7573550877399857, + "language_loss": 0.71313608, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73576236, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.6068592071533203 + }, + { + "auxiliary_loss_clip": 0.01152711, + "auxiliary_loss_mlp": 0.01112571, + "balance_loss_clip": 1.00195348, + "balance_loss_mlp": 1.00051475, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.7412536146244564, + "language_loss": 0.70542026, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72807312, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.6843838691711426 + }, + { + "auxiliary_loss_clip": 0.01137035, + "auxiliary_loss_mlp": 0.01111132, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.00060081, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.6948362275199178, + "language_loss": 0.64035761, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66283923, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.619339942932129 + }, + { + "auxiliary_loss_clip": 0.01167418, + "auxiliary_loss_mlp": 0.01112398, + "balance_loss_clip": 1.00205123, + "balance_loss_mlp": 1.00062704, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 6.39840899547927, + "language_loss": 0.80766177, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.83045983, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.5238664150238037 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.0006845, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.623852039025101, + "language_loss": 0.79157102, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81403744, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 4.066529035568237 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.0111181, + "balance_loss_clip": 1.00163364, + "balance_loss_mlp": 1.00042081, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.060280178331483, + "language_loss": 0.72083414, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74312592, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.6319732666015625 + }, + { + "auxiliary_loss_clip": 0.01137784, + "auxiliary_loss_mlp": 0.01112712, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00046468, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 5.114861859696209, + "language_loss": 0.75079262, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77329761, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 3.9942641258239746 + }, + { + "auxiliary_loss_clip": 0.01135977, + "auxiliary_loss_mlp": 0.01112918, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 1.00067019, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.710100062957273, + "language_loss": 0.7607426, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78323156, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 2.58935546875 + }, + { + "auxiliary_loss_clip": 0.01133836, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00071716, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 2.0182280665991414, + "language_loss": 0.82155478, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84401035, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.6219067573547363 + }, + { + "auxiliary_loss_clip": 0.01150443, + "auxiliary_loss_mlp": 0.01111617, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.00060964, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7572467857644432, + "language_loss": 0.74240488, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.7650255, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 2.6377651691436768 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.01113497, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00058222, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.5229546733096986, + "language_loss": 0.72742826, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74994135, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.600647449493408 + }, + { + "auxiliary_loss_clip": 0.01146768, + "auxiliary_loss_mlp": 0.01091908, + "balance_loss_clip": 1.00161862, + "balance_loss_mlp": 0.99997407, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7453882467405721, + "language_loss": 0.55356592, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57595265, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 3.294285774230957 + }, + { + "auxiliary_loss_clip": 0.01152426, + "auxiliary_loss_mlp": 0.01112633, + "balance_loss_clip": 1.00194323, + "balance_loss_mlp": 1.0005765, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.5540189498457657, + "language_loss": 0.82722068, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84987134, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 3.934797525405884 + }, + { + "auxiliary_loss_clip": 0.01167551, + "auxiliary_loss_mlp": 0.01113148, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.0006144, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.614629592163672, + "language_loss": 0.75137901, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77418602, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.545055866241455 + }, + { + "auxiliary_loss_clip": 0.01105466, + "auxiliary_loss_mlp": 0.01111347, + "balance_loss_clip": 1.00187933, + "balance_loss_mlp": 1.00062537, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.8742899986352415, + "language_loss": 0.69923186, + "learning_rate": 1.769368719290979e-06, + "loss": 0.72140002, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.708937168121338 + }, + { + "auxiliary_loss_clip": 0.01105877, + "auxiliary_loss_mlp": 0.00747625, + "balance_loss_clip": 1.00190747, + "balance_loss_mlp": 1.00096107, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.5506229118992991, + "language_loss": 0.68485349, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.70338845, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.733450174331665 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01111551, + "balance_loss_clip": 1.00203383, + "balance_loss_mlp": 1.00063825, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.8661569073562667, + "language_loss": 0.71572518, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.73851418, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.5132291316986084 + }, + { + "auxiliary_loss_clip": 0.01150548, + "auxiliary_loss_mlp": 0.01112307, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00072753, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.5117527421050787, + "language_loss": 0.69254637, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71517503, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.576920509338379 + }, + { + "auxiliary_loss_clip": 0.01167436, + "auxiliary_loss_mlp": 0.01112123, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00054348, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.8644686863868538, + "language_loss": 0.85824227, + "learning_rate": 1.767821335237733e-06, + "loss": 0.88103789, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.551344633102417 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.0004729, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.6994132763930128, + "language_loss": 0.80455112, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.8268714, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 2.626520872116089 + }, + { + "auxiliary_loss_clip": 0.01134076, + "auxiliary_loss_mlp": 0.01112483, + "balance_loss_clip": 1.00193083, + "balance_loss_mlp": 1.00052118, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.671730549052603, + "language_loss": 0.73488289, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75734842, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.604283094406128 + }, + { + "auxiliary_loss_clip": 0.01152204, + "auxiliary_loss_mlp": 0.01111849, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.0005548, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.7595259281521602, + "language_loss": 0.79218388, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81482446, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.529390335083008 + }, + { + "auxiliary_loss_clip": 0.01119931, + "auxiliary_loss_mlp": 0.01112226, + "balance_loss_clip": 1.00178361, + "balance_loss_mlp": 1.00055122, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 1.8882646745198295, + "language_loss": 0.76733708, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78965867, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.644946813583374 + }, + { + "auxiliary_loss_clip": 0.01152486, + "auxiliary_loss_mlp": 0.01111706, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00050712, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 1.942335518321052, + "language_loss": 0.8038466, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82648849, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.606081962585449 + }, + { + "auxiliary_loss_clip": 0.01150623, + "auxiliary_loss_mlp": 0.01113508, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00078356, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.6936675975639126, + "language_loss": 0.69184595, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71448731, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 2.708258628845215 + }, + { + "auxiliary_loss_clip": 0.01150572, + "auxiliary_loss_mlp": 0.01111984, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00049901, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.003031655636783, + "language_loss": 0.85424423, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.8768698, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.581568479537964 + }, + { + "auxiliary_loss_clip": 0.01130839, + "auxiliary_loss_mlp": 0.0109281, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00011253, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7840185343304832, + "language_loss": 0.59882057, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.62105703, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.2155561447143555 + }, + { + "auxiliary_loss_clip": 0.01120427, + "auxiliary_loss_mlp": 0.01112543, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00077224, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.484482225942451, + "language_loss": 0.70678616, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72911578, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.6302638053894043 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.01112615, + "balance_loss_clip": 1.00195849, + "balance_loss_mlp": 1.00074935, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.6502276821076256, + "language_loss": 0.75751364, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.78031349, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.5135691165924072 + }, + { + "auxiliary_loss_clip": 0.01116717, + "auxiliary_loss_mlp": 0.01111635, + "balance_loss_clip": 1.00174296, + "balance_loss_mlp": 1.00062692, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.6793821160495672, + "language_loss": 0.74748123, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.76976478, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.7090957164764404 + }, + { + "auxiliary_loss_clip": 0.01136327, + "auxiliary_loss_mlp": 0.01111813, + "balance_loss_clip": 1.00182533, + "balance_loss_mlp": 1.00061488, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.5593598215578837, + "language_loss": 0.72388756, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74636889, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.01150648, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00068426, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.6692044453799895, + "language_loss": 0.69495112, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71758687, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.546574592590332 + }, + { + "auxiliary_loss_clip": 0.0115238, + "auxiliary_loss_mlp": 0.01112376, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00060582, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.4865594788089151, + "language_loss": 0.70832813, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73097575, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.598944902420044 + }, + { + "auxiliary_loss_clip": 0.01150172, + "auxiliary_loss_mlp": 0.01112524, + "balance_loss_clip": 1.00188959, + "balance_loss_mlp": 1.00065804, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.5555425038388324, + "language_loss": 0.80250615, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82513309, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.5262365341186523 + }, + { + "auxiliary_loss_clip": 0.01086162, + "auxiliary_loss_mlp": 0.01113012, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00066924, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.444247973669707, + "language_loss": 0.74840057, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77039224, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.7302803993225098 + }, + { + "auxiliary_loss_clip": 0.01151744, + "auxiliary_loss_mlp": 0.01112582, + "balance_loss_clip": 1.00201046, + "balance_loss_mlp": 1.00071573, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.5770427498664772, + "language_loss": 0.70324385, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72588712, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.6953907012939453 + }, + { + "auxiliary_loss_clip": 0.01152461, + "auxiliary_loss_mlp": 0.01112709, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00065231, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.757552509727527, + "language_loss": 0.67475432, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69740605, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.547030210494995 + }, + { + "auxiliary_loss_clip": 0.01167455, + "auxiliary_loss_mlp": 0.01113568, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.0007484, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.869544569902569, + "language_loss": 0.79607785, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81888807, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 3.8989522457122803 + }, + { + "auxiliary_loss_clip": 0.01119374, + "auxiliary_loss_mlp": 0.01112833, + "balance_loss_clip": 1.00165176, + "balance_loss_mlp": 1.00049007, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 2.0791678372462377, + "language_loss": 0.83158886, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85391098, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.628628969192505 + }, + { + "auxiliary_loss_clip": 0.01135183, + "auxiliary_loss_mlp": 0.0111195, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.000561, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.2714698948261827, + "language_loss": 0.6738894, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69636071, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.01150628, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00045729, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.3866106670871403, + "language_loss": 0.76193404, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78455591, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.5854156017303467 + }, + { + "auxiliary_loss_clip": 0.01120656, + "auxiliary_loss_mlp": 0.01112668, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00080192, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.5990671475831593, + "language_loss": 0.73637664, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75870991, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.7123498916625977 + }, + { + "auxiliary_loss_clip": 0.01119067, + "auxiliary_loss_mlp": 0.01113417, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.0006932, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 1.982585819877094, + "language_loss": 0.66663229, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68895715, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 4.176778793334961 + }, + { + "auxiliary_loss_clip": 0.01133703, + "auxiliary_loss_mlp": 0.01113224, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00059509, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.4834917305144566, + "language_loss": 0.78143513, + "learning_rate": 1.758153413657318e-06, + "loss": 0.80390441, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.561724901199341 + }, + { + "auxiliary_loss_clip": 0.01135992, + "auxiliary_loss_mlp": 0.01113125, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.0005914, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.7108364952455657, + "language_loss": 0.81755304, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.8400442, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 4.0243260860443115 + }, + { + "auxiliary_loss_clip": 0.01150867, + "auxiliary_loss_mlp": 0.007477, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00112987, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.4108924075333142, + "language_loss": 0.76409006, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78307581, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.590071678161621 + }, + { + "auxiliary_loss_clip": 0.01167675, + "auxiliary_loss_mlp": 0.01114053, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00066066, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.1414409008029307, + "language_loss": 0.79393405, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81675136, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.472980499267578 + }, + { + "auxiliary_loss_clip": 0.01089673, + "auxiliary_loss_mlp": 0.01112162, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00058246, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 1.7120830112239742, + "language_loss": 0.68801939, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71003771, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.6892576217651367 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01111455, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.0007329, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.4698653939772224, + "language_loss": 0.77469987, + "learning_rate": 1.756220509823588e-06, + "loss": 0.7973305, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 2.5610623359680176 + }, + { + "auxiliary_loss_clip": 0.0112059, + "auxiliary_loss_mlp": 0.01112803, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00065064, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.463537227097254, + "language_loss": 0.78470707, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80704099, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.644559621810913 + }, + { + "auxiliary_loss_clip": 0.01117277, + "auxiliary_loss_mlp": 0.01113203, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00057423, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.716192923861141, + "language_loss": 0.69792616, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.72023094, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 4.140957832336426 + }, + { + "auxiliary_loss_clip": 0.01135819, + "auxiliary_loss_mlp": 0.01112962, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00061917, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 1.8945663439640732, + "language_loss": 0.74509835, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76758617, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 2.565063953399658 + }, + { + "auxiliary_loss_clip": 0.0115278, + "auxiliary_loss_mlp": 0.01112939, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00078666, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5891137940297198, + "language_loss": 0.76705766, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78971481, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 2.5852301120758057 + }, + { + "auxiliary_loss_clip": 0.01135648, + "auxiliary_loss_mlp": 0.01111563, + "balance_loss_clip": 1.00178206, + "balance_loss_mlp": 1.00046015, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.5078178833052547, + "language_loss": 0.75901836, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78149045, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.786720037460327 + }, + { + "auxiliary_loss_clip": 0.01167375, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.0004518, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.4121119321561384, + "language_loss": 0.7915315, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81432939, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 2.536884307861328 + }, + { + "auxiliary_loss_clip": 0.01118874, + "auxiliary_loss_mlp": 0.01112205, + "balance_loss_clip": 1.0015533, + "balance_loss_mlp": 1.00052977, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.7604109956784337, + "language_loss": 0.64123917, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66354996, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.607936382293701 + }, + { + "auxiliary_loss_clip": 0.01134086, + "auxiliary_loss_mlp": 0.01112714, + "balance_loss_clip": 1.00193846, + "balance_loss_mlp": 1.00056219, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.4844217531428263, + "language_loss": 0.66221815, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68468612, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.629808187484741 + }, + { + "auxiliary_loss_clip": 0.0115258, + "auxiliary_loss_mlp": 0.01113581, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00066578, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 1.8919415874160375, + "language_loss": 0.60482013, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.62748176, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.536635398864746 + }, + { + "auxiliary_loss_clip": 0.01150577, + "auxiliary_loss_mlp": 0.00747658, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00102067, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.653325075065568, + "language_loss": 0.64253485, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.6615172, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.5507307052612305 + }, + { + "auxiliary_loss_clip": 0.0115076, + "auxiliary_loss_mlp": 0.01112946, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00050735, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5349966014870167, + "language_loss": 0.63661337, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65925044, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.548419952392578 + }, + { + "auxiliary_loss_clip": 0.01150495, + "auxiliary_loss_mlp": 0.01112004, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00051999, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 2.0755853469975034, + "language_loss": 0.7729069, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79553187, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.5688109397888184 + }, + { + "auxiliary_loss_clip": 0.01103838, + "auxiliary_loss_mlp": 0.0111155, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.00063753, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.384316491339248, + "language_loss": 0.72630024, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74845409, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.7894179821014404 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01112286, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00070632, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.8701481063980534, + "language_loss": 0.75542808, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.7776081, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 2.6600852012634277 + }, + { + "auxiliary_loss_clip": 0.0111734, + "auxiliary_loss_mlp": 0.01113901, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00050902, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.4132666547620203, + "language_loss": 0.62223089, + "learning_rate": 1.750423192272189e-06, + "loss": 0.64454329, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 2.571284294128418 + }, + { + "auxiliary_loss_clip": 0.01167546, + "auxiliary_loss_mlp": 0.01113399, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.0006752, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.0538851498161015, + "language_loss": 0.64012015, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66292959, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.478079080581665 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01113015, + "balance_loss_clip": 1.0017761, + "balance_loss_mlp": 1.00076783, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.7798057566862122, + "language_loss": 0.82781875, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.8501476, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.645017147064209 + }, + { + "auxiliary_loss_clip": 0.01137066, + "auxiliary_loss_mlp": 0.0111276, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00051296, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.5282460649288607, + "language_loss": 0.72491485, + "learning_rate": 1.74926398270663e-06, + "loss": 0.7474131, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.6316230297088623 + }, + { + "auxiliary_loss_clip": 0.01120556, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00058937, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.8999151854153027, + "language_loss": 0.66711217, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68945372, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.638214588165283 + }, + { + "auxiliary_loss_clip": 0.01120357, + "auxiliary_loss_mlp": 0.01113771, + "balance_loss_clip": 1.00171828, + "balance_loss_mlp": 1.00037885, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.293606038767326, + "language_loss": 0.51551735, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53785866, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.713549852371216 + }, + { + "auxiliary_loss_clip": 0.01119234, + "auxiliary_loss_mlp": 0.01113389, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00047421, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.9304509402223855, + "language_loss": 0.85754657, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.8798728, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.599695920944214 + }, + { + "auxiliary_loss_clip": 0.0115205, + "auxiliary_loss_mlp": 0.01112244, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00056863, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.6000365373792662, + "language_loss": 0.70287871, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72552162, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.572190761566162 + }, + { + "auxiliary_loss_clip": 0.01135964, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_clip": 1.00204754, + "balance_loss_mlp": 1.00052667, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.5332602606800056, + "language_loss": 0.73474634, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75723362, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.6156933307647705 + }, + { + "auxiliary_loss_clip": 0.01133492, + "auxiliary_loss_mlp": 0.01112167, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00058663, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.7237719085359753, + "language_loss": 0.71294379, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73540038, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.597586154937744 + }, + { + "auxiliary_loss_clip": 0.01150876, + "auxiliary_loss_mlp": 0.01111485, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00047743, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.772990813995262, + "language_loss": 0.78246319, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80508679, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.668346643447876 + }, + { + "auxiliary_loss_clip": 0.01119611, + "auxiliary_loss_mlp": 0.01112969, + "balance_loss_clip": 1.00172544, + "balance_loss_mlp": 1.00062633, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.4889588829712783, + "language_loss": 0.7260958, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74842155, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 4.024396896362305 + }, + { + "auxiliary_loss_clip": 0.01151157, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00060987, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.698257083317501, + "language_loss": 0.71513283, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73778254, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.5353543758392334 + }, + { + "auxiliary_loss_clip": 0.0116723, + "auxiliary_loss_mlp": 0.01111691, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00049245, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.5799910305378817, + "language_loss": 0.79465312, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.8174423, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.515280246734619 + }, + { + "auxiliary_loss_clip": 0.01120051, + "auxiliary_loss_mlp": 0.01113494, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00048375, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.574283650602609, + "language_loss": 0.83296412, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.85529959, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.633664846420288 + }, + { + "auxiliary_loss_clip": 0.01119696, + "auxiliary_loss_mlp": 0.00747881, + "balance_loss_clip": 1.00185144, + "balance_loss_mlp": 1.00104499, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.5606407762342875, + "language_loss": 0.75293905, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77161479, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.7144103050231934 + }, + { + "auxiliary_loss_clip": 0.01135495, + "auxiliary_loss_mlp": 0.01112278, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00060296, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 2.3673910757942846, + "language_loss": 0.82215196, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84462976, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 4.033726453781128 + }, + { + "auxiliary_loss_clip": 0.01151046, + "auxiliary_loss_mlp": 0.01113774, + "balance_loss_clip": 1.00206411, + "balance_loss_mlp": 1.00076365, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 1.8949145761011954, + "language_loss": 0.57276422, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59541249, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.5408897399902344 + }, + { + "auxiliary_loss_clip": 0.01151623, + "auxiliary_loss_mlp": 0.0111286, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00061297, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5205711145179859, + "language_loss": 0.67845023, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.7010951, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 3.9375150203704834 + }, + { + "auxiliary_loss_clip": 0.01119058, + "auxiliary_loss_mlp": 0.01111629, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00062156, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.2655224671971605, + "language_loss": 0.74224186, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76454866, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.6409881114959717 + }, + { + "auxiliary_loss_clip": 0.01120192, + "auxiliary_loss_mlp": 0.01113417, + "balance_loss_clip": 1.0020262, + "balance_loss_mlp": 1.00050235, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 2.71522901021034, + "language_loss": 0.73426926, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75660533, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.6236493587493896 + }, + { + "auxiliary_loss_clip": 0.01167418, + "auxiliary_loss_mlp": 0.01112976, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.00063372, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.8213398079080985, + "language_loss": 0.75872266, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.78152668, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.479635000228882 + }, + { + "auxiliary_loss_clip": 0.01150303, + "auxiliary_loss_mlp": 0.00747806, + "balance_loss_clip": 1.00202763, + "balance_loss_mlp": 1.00105286, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.3797684291420627, + "language_loss": 0.68681568, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70579678, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.5401642322540283 + }, + { + "auxiliary_loss_clip": 0.01103985, + "auxiliary_loss_mlp": 0.01113235, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00051022, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.0189643936083974, + "language_loss": 0.6821835, + "learning_rate": 1.741538124855163e-06, + "loss": 0.70435572, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 2.685253381729126 + }, + { + "auxiliary_loss_clip": 0.01167666, + "auxiliary_loss_mlp": 0.01113406, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.00049138, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.5696887813448817, + "language_loss": 0.77860391, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80141461, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.558647632598877 + }, + { + "auxiliary_loss_clip": 0.01121006, + "auxiliary_loss_mlp": 0.01111408, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00049567, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.5226958789291498, + "language_loss": 0.82720768, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84953183, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 4.0591747760772705 + }, + { + "auxiliary_loss_clip": 0.01152628, + "auxiliary_loss_mlp": 0.01113531, + "balance_loss_clip": 1.00194228, + "balance_loss_mlp": 1.00071144, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 1.8816215500894906, + "language_loss": 0.75117445, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77383602, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.533919334411621 + }, + { + "auxiliary_loss_clip": 0.01135409, + "auxiliary_loss_mlp": 0.01111856, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00056207, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 1.8400259095322475, + "language_loss": 0.65131223, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.67378491, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.5892646312713623 + }, + { + "auxiliary_loss_clip": 0.01106609, + "auxiliary_loss_mlp": 0.01113858, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00065696, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.5394374887940823, + "language_loss": 0.68130559, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70351022, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 2.6405632495880127 + }, + { + "auxiliary_loss_clip": 0.01167142, + "auxiliary_loss_mlp": 0.01111669, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00047064, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 1.693529555694017, + "language_loss": 0.86407262, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88686073, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 2.5441133975982666 + }, + { + "auxiliary_loss_clip": 0.01151422, + "auxiliary_loss_mlp": 0.01112094, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00060987, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.6971046667208183, + "language_loss": 0.73332775, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75596297, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.551105499267578 + }, + { + "auxiliary_loss_clip": 0.01152533, + "auxiliary_loss_mlp": 0.01113027, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00049388, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.5917681267699963, + "language_loss": 0.78096217, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80361778, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.794614315032959 + }, + { + "auxiliary_loss_clip": 0.01134147, + "auxiliary_loss_mlp": 0.01111516, + "balance_loss_clip": 1.00188673, + "balance_loss_mlp": 1.00050855, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.507223016944521, + "language_loss": 0.79842925, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82088584, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.5922298431396484 + }, + { + "auxiliary_loss_clip": 0.0113371, + "auxiliary_loss_mlp": 0.01112034, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.0005492, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.6597010710498652, + "language_loss": 0.65100592, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67346334, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.6113169193267822 + }, + { + "auxiliary_loss_clip": 0.01150644, + "auxiliary_loss_mlp": 0.00747864, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00121284, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 2.128783227769636, + "language_loss": 0.73124456, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.7502296, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.510615825653076 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01112254, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.000579, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.9388326653343517, + "language_loss": 0.63402134, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.65651143, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.6225781440734863 + }, + { + "auxiliary_loss_clip": 0.01134316, + "auxiliary_loss_mlp": 0.00747904, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00116467, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.9846311173982116, + "language_loss": 0.75135255, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77017474, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.605956792831421 + }, + { + "auxiliary_loss_clip": 0.01135557, + "auxiliary_loss_mlp": 0.01111389, + "balance_loss_clip": 1.00183284, + "balance_loss_mlp": 1.00047684, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.2480564600796105, + "language_loss": 0.74590933, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76837873, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.591585636138916 + }, + { + "auxiliary_loss_clip": 0.01136233, + "auxiliary_loss_mlp": 0.01113772, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00066626, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 1.8943973095067776, + "language_loss": 0.80006278, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82256275, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 2.605973720550537 + }, + { + "auxiliary_loss_clip": 0.01167342, + "auxiliary_loss_mlp": 0.0111249, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00071955, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.7539660744338563, + "language_loss": 0.74004102, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76283932, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.4966132640838623 + }, + { + "auxiliary_loss_clip": 0.01133954, + "auxiliary_loss_mlp": 0.01112759, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00060654, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 2.5982273926991333, + "language_loss": 0.76169205, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78415912, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.551687717437744 + }, + { + "auxiliary_loss_clip": 0.01100421, + "auxiliary_loss_mlp": 0.01091942, + "balance_loss_clip": 1.00159431, + "balance_loss_mlp": 1.00000787, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8646970109085687, + "language_loss": 0.59453404, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61645758, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.3438539505004883 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01112784, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00053668, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8458225120366516, + "language_loss": 0.80374205, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82654291, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.511547565460205 + }, + { + "auxiliary_loss_clip": 0.01133533, + "auxiliary_loss_mlp": 0.01113679, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.00057292, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 1.9843861625958885, + "language_loss": 0.68604755, + "learning_rate": 1.733816187358836e-06, + "loss": 0.7085197, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.550938844680786 + }, + { + "auxiliary_loss_clip": 0.01150532, + "auxiliary_loss_mlp": 0.01111867, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00057316, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.5573736394429776, + "language_loss": 0.75370246, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77632642, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.612771511077881 + }, + { + "auxiliary_loss_clip": 0.01152312, + "auxiliary_loss_mlp": 0.01112878, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00063014, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.5956615718588794, + "language_loss": 0.72224283, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74489468, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.617997407913208 + }, + { + "auxiliary_loss_clip": 0.01120743, + "auxiliary_loss_mlp": 0.01111454, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00054145, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.764870389528481, + "language_loss": 0.82963085, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85195291, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.6402158737182617 + }, + { + "auxiliary_loss_clip": 0.01132868, + "auxiliary_loss_mlp": 0.0109192, + "balance_loss_clip": 1.0015893, + "balance_loss_mlp": 0.99998587, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.8686540860552253, + "language_loss": 0.64834267, + "learning_rate": 1.732272280610387e-06, + "loss": 0.67059058, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 3.0133216381073 + }, + { + "auxiliary_loss_clip": 0.01152143, + "auxiliary_loss_mlp": 0.01111996, + "balance_loss_clip": 1.00212312, + "balance_loss_mlp": 1.00060678, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.6813158141832167, + "language_loss": 0.69452423, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71716559, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.5637214183807373 + }, + { + "auxiliary_loss_clip": 0.01116973, + "auxiliary_loss_mlp": 0.01111107, + "balance_loss_clip": 1.00173604, + "balance_loss_mlp": 1.00076699, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.5092437777194905, + "language_loss": 0.76031619, + "learning_rate": 1.73150038809119e-06, + "loss": 0.78259695, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 4.1164162158966064 + }, + { + "auxiliary_loss_clip": 0.01100431, + "auxiliary_loss_mlp": 0.0111181, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00061131, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 1.9732831899557541, + "language_loss": 0.61073041, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63285279, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.6320478916168213 + }, + { + "auxiliary_loss_clip": 0.0111874, + "auxiliary_loss_mlp": 0.0111286, + "balance_loss_clip": 1.0016849, + "balance_loss_mlp": 1.00051689, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.7006597951216873, + "language_loss": 0.79033506, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.8126511, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 2.6561667919158936 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01112339, + "balance_loss_clip": 1.00167489, + "balance_loss_mlp": 1.000664, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.8505562898927945, + "language_loss": 0.81839913, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.84086454, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.6049039363861084 + }, + { + "auxiliary_loss_clip": 0.01167476, + "auxiliary_loss_mlp": 0.01112759, + "balance_loss_clip": 1.00205541, + "balance_loss_mlp": 1.00060725, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4021244465051304, + "language_loss": 0.69151819, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71432054, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.502389430999756 + }, + { + "auxiliary_loss_clip": 0.01113283, + "auxiliary_loss_mlp": 0.01091584, + "balance_loss_clip": 1.0013113, + "balance_loss_mlp": 1.00003135, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7564866841193588, + "language_loss": 0.61103976, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63308835, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 4.580052137374878 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.01113427, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00060761, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.5852355001206948, + "language_loss": 0.64819229, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.67084634, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.5908870697021484 + }, + { + "auxiliary_loss_clip": 0.01137547, + "auxiliary_loss_mlp": 0.01112989, + "balance_loss_clip": 1.0019511, + "balance_loss_mlp": 1.00064635, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.5205843971904853, + "language_loss": 0.73224449, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75474983, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 4.098703861236572 + }, + { + "auxiliary_loss_clip": 0.01120351, + "auxiliary_loss_mlp": 0.01112205, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00062501, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.7998597222227373, + "language_loss": 0.75942099, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78174657, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.603731393814087 + }, + { + "auxiliary_loss_clip": 0.01133924, + "auxiliary_loss_mlp": 0.01111311, + "balance_loss_clip": 1.00174761, + "balance_loss_mlp": 1.00049388, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.4348993848159592, + "language_loss": 0.70843738, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73088974, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.6103131771087646 + }, + { + "auxiliary_loss_clip": 0.01135898, + "auxiliary_loss_mlp": 0.01111911, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.00061691, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7014080720285853, + "language_loss": 0.6838187, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70629674, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.5999197959899902 + }, + { + "auxiliary_loss_clip": 0.01151802, + "auxiliary_loss_mlp": 0.01111544, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.00063169, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.7088852672054837, + "language_loss": 0.7441408, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76677424, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.5494658946990967 + }, + { + "auxiliary_loss_clip": 0.01150914, + "auxiliary_loss_mlp": 0.00747701, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.0010848, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 1.887579244521242, + "language_loss": 0.75073105, + "learning_rate": 1.726869892322104e-06, + "loss": 0.76971722, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.5774118900299072 + }, + { + "auxiliary_loss_clip": 0.0112019, + "auxiliary_loss_mlp": 0.0111187, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00057626, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.7345240652524836, + "language_loss": 0.83048922, + "learning_rate": 1.726484084647256e-06, + "loss": 0.85280979, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 2.6675803661346436 + }, + { + "auxiliary_loss_clip": 0.01104421, + "auxiliary_loss_mlp": 0.01112629, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 1.8033052293605722, + "language_loss": 0.79401898, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81618947, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 4.034447431564331 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.0111295, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.000512, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.7184381350024238, + "language_loss": 0.90206575, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92455357, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.578695058822632 + }, + { + "auxiliary_loss_clip": 0.01118873, + "auxiliary_loss_mlp": 0.01111699, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00050116, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.8310939940880433, + "language_loss": 0.84028292, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.86258864, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 2.6123111248016357 + }, + { + "auxiliary_loss_clip": 0.0115236, + "auxiliary_loss_mlp": 0.011122, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00071549, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.041809977117414, + "language_loss": 0.74386895, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76651454, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.571331024169922 + }, + { + "auxiliary_loss_clip": 0.01133796, + "auxiliary_loss_mlp": 0.01112501, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00063539, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 3.1819505708250664, + "language_loss": 0.77821362, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.80067658, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.537292003631592 + }, + { + "auxiliary_loss_clip": 0.01134186, + "auxiliary_loss_mlp": 0.01112535, + "balance_loss_clip": 1.0017432, + "balance_loss_mlp": 1.00057352, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.620899602896328, + "language_loss": 0.74933898, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.77180624, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 2.5388996601104736 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01110734, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00048959, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.6247899301244506, + "language_loss": 0.7511704, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77363992, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 2.5841307640075684 + }, + { + "auxiliary_loss_clip": 0.01167168, + "auxiliary_loss_mlp": 0.01111796, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00059795, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.5152282370013794, + "language_loss": 0.71765411, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.74044371, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.497966766357422 + }, + { + "auxiliary_loss_clip": 0.01117445, + "auxiliary_loss_mlp": 0.01111798, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00050426, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.448427589817047, + "language_loss": 0.75628835, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77858078, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.6639175415039062 + }, + { + "auxiliary_loss_clip": 0.01135597, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00055552, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.7381120908202463, + "language_loss": 0.67657673, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69904453, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.598785400390625 + }, + { + "auxiliary_loss_clip": 0.01152588, + "auxiliary_loss_mlp": 0.01111826, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.00072312, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.5027674462134877, + "language_loss": 0.73106158, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75370574, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.6118061542510986 + }, + { + "auxiliary_loss_clip": 0.01120344, + "auxiliary_loss_mlp": 0.00747719, + "balance_loss_clip": 1.00192893, + "balance_loss_mlp": 1.00104129, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 3.361732754308839, + "language_loss": 0.75174278, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77042341, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.6377220153808594 + }, + { + "auxiliary_loss_clip": 0.01085125, + "auxiliary_loss_mlp": 0.01111006, + "balance_loss_clip": 1.00167298, + "balance_loss_mlp": 1.00047565, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.722591319318059, + "language_loss": 0.66332507, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68528634, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.7084405422210693 + }, + { + "auxiliary_loss_clip": 0.01116715, + "auxiliary_loss_mlp": 0.01110702, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00045741, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 2.0169191574374894, + "language_loss": 0.83112741, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85340166, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.6477739810943604 + }, + { + "auxiliary_loss_clip": 0.0113324, + "auxiliary_loss_mlp": 0.01112069, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00048864, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.478496821526025, + "language_loss": 0.85389698, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87635005, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.6178455352783203 + }, + { + "auxiliary_loss_clip": 0.01134099, + "auxiliary_loss_mlp": 0.01111938, + "balance_loss_clip": 1.001652, + "balance_loss_mlp": 1.0007391, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 2.202654756244375, + "language_loss": 0.73669028, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75915062, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.649768590927124 + }, + { + "auxiliary_loss_clip": 0.0116725, + "auxiliary_loss_mlp": 0.01111025, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00039899, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.7260530569090553, + "language_loss": 0.73950052, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76228327, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.5691730976104736 + }, + { + "auxiliary_loss_clip": 0.01121056, + "auxiliary_loss_mlp": 0.01112636, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00057936, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.5195946751924703, + "language_loss": 0.74956441, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77190131, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.7067344188690186 + }, + { + "auxiliary_loss_clip": 0.01136265, + "auxiliary_loss_mlp": 0.0111258, + "balance_loss_clip": 1.00207531, + "balance_loss_mlp": 1.00071383, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.276557861032467, + "language_loss": 0.7754609, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79794931, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.599339008331299 + }, + { + "auxiliary_loss_clip": 0.01119074, + "auxiliary_loss_mlp": 0.011135, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00077534, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 2.014316356490823, + "language_loss": 0.61563867, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63796437, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.7152702808380127 + }, + { + "auxiliary_loss_clip": 0.01104518, + "auxiliary_loss_mlp": 0.01111932, + "balance_loss_clip": 1.0018028, + "balance_loss_mlp": 1.0005424, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 3.238740937445816, + "language_loss": 0.68270075, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70486522, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 2.763089179992676 + }, + { + "auxiliary_loss_clip": 0.0111981, + "auxiliary_loss_mlp": 0.01111417, + "balance_loss_clip": 1.00177467, + "balance_loss_mlp": 1.00069594, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 1.683497621837803, + "language_loss": 0.83965242, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.8619647, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.663525104522705 + }, + { + "auxiliary_loss_clip": 0.01135896, + "auxiliary_loss_mlp": 0.01110938, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.00069332, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 2.2423552199111216, + "language_loss": 0.73595566, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75842398, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.658323049545288 + }, + { + "auxiliary_loss_clip": 0.0113308, + "auxiliary_loss_mlp": 0.01111313, + "balance_loss_clip": 1.00204861, + "balance_loss_mlp": 1.00059104, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.800907965921558, + "language_loss": 0.72497666, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74742061, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 4.066075801849365 + }, + { + "auxiliary_loss_clip": 0.01133778, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00107753, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.0615908004946877, + "language_loss": 0.68401194, + "learning_rate": 1.716842301625806e-06, + "loss": 0.7028265, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.7284233570098877 + }, + { + "auxiliary_loss_clip": 0.01167242, + "auxiliary_loss_mlp": 0.01111373, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00055563, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.4992889247244874, + "language_loss": 0.80735779, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.83014393, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.537604331970215 + }, + { + "auxiliary_loss_clip": 0.01150478, + "auxiliary_loss_mlp": 0.01111577, + "balance_loss_clip": 1.00194323, + "balance_loss_mlp": 1.00056958, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.6080471333572195, + "language_loss": 0.65405309, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67667365, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.5728304386138916 + }, + { + "auxiliary_loss_clip": 0.01117041, + "auxiliary_loss_mlp": 0.01112919, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00076747, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.5334895248899691, + "language_loss": 0.75411916, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77641875, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.591754913330078 + }, + { + "auxiliary_loss_clip": 0.01132524, + "auxiliary_loss_mlp": 0.0109193, + "balance_loss_clip": 1.00147974, + "balance_loss_mlp": 0.99999613, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6815388533036876, + "language_loss": 0.52476048, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54700506, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 4.623938083648682 + }, + { + "auxiliary_loss_clip": 0.01152313, + "auxiliary_loss_mlp": 0.01110604, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00064528, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.7171731133265968, + "language_loss": 0.68709332, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70972252, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.679312229156494 + }, + { + "auxiliary_loss_clip": 0.01105871, + "auxiliary_loss_mlp": 0.01112366, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00088203, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 5.877795987588385, + "language_loss": 0.81858963, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84077203, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 2.6684296131134033 + }, + { + "auxiliary_loss_clip": 0.01167225, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00052643, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.209065768063599, + "language_loss": 0.67449105, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69728053, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 3.922429084777832 + }, + { + "auxiliary_loss_clip": 0.01122671, + "auxiliary_loss_mlp": 0.01112451, + "balance_loss_clip": 1.00203264, + "balance_loss_mlp": 1.00048923, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.6290233574313524, + "language_loss": 0.70799673, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73034787, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.6573712825775146 + }, + { + "auxiliary_loss_clip": 0.01088991, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_clip": 1.00218773, + "balance_loss_mlp": 1.00055289, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.4992516854510995, + "language_loss": 0.72662461, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74862254, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 2.7529795169830322 + }, + { + "auxiliary_loss_clip": 0.01150424, + "auxiliary_loss_mlp": 0.0111211, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00052977, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 2.1599363102815357, + "language_loss": 0.78021324, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80283856, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.5828664302825928 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01110912, + "balance_loss_clip": 1.00156069, + "balance_loss_mlp": 1.00057173, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.9069099825318772, + "language_loss": 0.69579995, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71793067, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.663292646408081 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00003481, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9068302098001991, + "language_loss": 0.60279369, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62503242, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.29140305519104 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01110855, + "balance_loss_clip": 1.00207293, + "balance_loss_mlp": 1.0007056, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.5448637021572709, + "language_loss": 0.73984873, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76245672, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.7117552757263184 + }, + { + "auxiliary_loss_clip": 0.01089185, + "auxiliary_loss_mlp": 0.01112065, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00058031, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.7021806189406559, + "language_loss": 0.69841123, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.7204237, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 4.1889283657073975 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01112794, + "balance_loss_clip": 1.00185001, + "balance_loss_mlp": 1.00054693, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.9523227012538242, + "language_loss": 0.75299346, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77546227, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.6119096279144287 + }, + { + "auxiliary_loss_clip": 0.01150691, + "auxiliary_loss_mlp": 0.0111358, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00056934, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.0292348668578586, + "language_loss": 0.69432229, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71696502, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.624168634414673 + }, + { + "auxiliary_loss_clip": 0.01150522, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_clip": 1.00191224, + "balance_loss_mlp": 1.00054526, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.756960492123788, + "language_loss": 0.72319299, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74581563, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.5416696071624756 + }, + { + "auxiliary_loss_clip": 0.01116755, + "auxiliary_loss_mlp": 0.01111857, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00056362, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.883593239016605, + "language_loss": 0.89345884, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91574496, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.608510971069336 + }, + { + "auxiliary_loss_clip": 0.01120561, + "auxiliary_loss_mlp": 0.01112617, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00084603, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.3847185950677274, + "language_loss": 0.77639651, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79872823, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 2.663423538208008 + }, + { + "auxiliary_loss_clip": 0.01118975, + "auxiliary_loss_mlp": 0.01111716, + "balance_loss_clip": 1.00156856, + "balance_loss_mlp": 1.00042176, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.5820228314998899, + "language_loss": 0.70633066, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72863758, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 2.7008841037750244 + }, + { + "auxiliary_loss_clip": 0.01136015, + "auxiliary_loss_mlp": 0.01113015, + "balance_loss_clip": 1.00182736, + "balance_loss_mlp": 1.00067234, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 1.6860438163673044, + "language_loss": 0.66367865, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68616891, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.6460344791412354 + }, + { + "auxiliary_loss_clip": 0.01120497, + "auxiliary_loss_mlp": 0.01111142, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00042081, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.3369616592609592, + "language_loss": 0.86697561, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.889292, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.6400394439697266 + }, + { + "auxiliary_loss_clip": 0.01150886, + "auxiliary_loss_mlp": 0.01112751, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00059867, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.5306078805257273, + "language_loss": 0.77346766, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79610407, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.591887950897217 + }, + { + "auxiliary_loss_clip": 0.01150312, + "auxiliary_loss_mlp": 0.01111504, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00059128, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.5702973042311668, + "language_loss": 0.76193446, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78455257, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.589737892150879 + }, + { + "auxiliary_loss_clip": 0.01150474, + "auxiliary_loss_mlp": 0.01111132, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00069642, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.5002655662775173, + "language_loss": 0.85368156, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87629771, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.629554033279419 + }, + { + "auxiliary_loss_clip": 0.01148007, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_clip": 1.00145173, + "balance_loss_mlp": 1.00003564, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.744946939252114, + "language_loss": 0.52586329, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54825926, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 2.942978858947754 + }, + { + "auxiliary_loss_clip": 0.01135491, + "auxiliary_loss_mlp": 0.01111228, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00050604, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.2938716105607084, + "language_loss": 0.74231315, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76478028, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.623861789703369 + }, + { + "auxiliary_loss_clip": 0.01167267, + "auxiliary_loss_mlp": 0.01112259, + "balance_loss_clip": 1.00198817, + "balance_loss_mlp": 1.00048852, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 2.085687112748403, + "language_loss": 0.73827422, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76106942, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.6367993354797363 + }, + { + "auxiliary_loss_clip": 0.01134168, + "auxiliary_loss_mlp": 0.01112008, + "balance_loss_clip": 1.00171292, + "balance_loss_mlp": 1.00042796, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.4233620537635019, + "language_loss": 0.61356223, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.636024, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.602104425430298 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.01112825, + "balance_loss_clip": 1.00174451, + "balance_loss_mlp": 1.0006727, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 1.856655639539795, + "language_loss": 0.87586564, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89803016, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.6392931938171387 + }, + { + "auxiliary_loss_clip": 0.01135478, + "auxiliary_loss_mlp": 0.01112347, + "balance_loss_clip": 1.00184381, + "balance_loss_mlp": 1.00048113, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6733557806844583, + "language_loss": 0.73808146, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76055974, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.586987257003784 + }, + { + "auxiliary_loss_clip": 0.01137203, + "auxiliary_loss_mlp": 0.01112971, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.0004369, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 1.7297848648377334, + "language_loss": 0.7810868, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80358851, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.5876412391662598 + }, + { + "auxiliary_loss_clip": 0.01150874, + "auxiliary_loss_mlp": 0.01112309, + "balance_loss_clip": 1.00212049, + "balance_loss_mlp": 1.00044346, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.4647897170532442, + "language_loss": 0.78279674, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.8054285, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.5854814052581787 + }, + { + "auxiliary_loss_clip": 0.01167292, + "auxiliary_loss_mlp": 0.01111295, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.00047779, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.4135357824531019, + "language_loss": 0.73685932, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75964522, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.538222551345825 + }, + { + "auxiliary_loss_clip": 0.01133775, + "auxiliary_loss_mlp": 0.00748015, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00133681, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.4830003625402437, + "language_loss": 0.83665293, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.8554709, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.5909862518310547 + }, + { + "auxiliary_loss_clip": 0.01163505, + "auxiliary_loss_mlp": 0.01092022, + "balance_loss_clip": 1.00150728, + "balance_loss_mlp": 1.00008798, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7193026940556327, + "language_loss": 0.57890517, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.60146046, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 4.55370306968689 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01111323, + "balance_loss_clip": 1.00170612, + "balance_loss_mlp": 1.00060129, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.6795943744741135, + "language_loss": 0.82000846, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84214193, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.6829941272735596 + }, + { + "auxiliary_loss_clip": 0.01152875, + "auxiliary_loss_mlp": 0.01113701, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00059545, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 2.5808047491724415, + "language_loss": 0.82006264, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.84272838, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.533614158630371 + }, + { + "auxiliary_loss_clip": 0.0116719, + "auxiliary_loss_mlp": 0.01112173, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00049818, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.5267980286710034, + "language_loss": 0.72561908, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.74841267, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.5217249393463135 + }, + { + "auxiliary_loss_clip": 0.01135686, + "auxiliary_loss_mlp": 0.01111993, + "balance_loss_clip": 1.00207579, + "balance_loss_mlp": 1.00050902, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7009263810105253, + "language_loss": 0.70762241, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.7300992, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.5394794940948486 + }, + { + "auxiliary_loss_clip": 0.01135926, + "auxiliary_loss_mlp": 0.01112208, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00053263, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7702556703226344, + "language_loss": 0.7740624, + "learning_rate": 1.701044410566205e-06, + "loss": 0.79654372, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 4.065215587615967 + }, + { + "auxiliary_loss_clip": 0.011505, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_clip": 1.00198817, + "balance_loss_mlp": 1.00051284, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.266610278975987, + "language_loss": 0.64138079, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.6640057, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.6458191871643066 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_clip": 1.00136065, + "balance_loss_mlp": 1.00017977, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.8873993192620325, + "language_loss": 0.62586284, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64808369, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 3.1205685138702393 + }, + { + "auxiliary_loss_clip": 0.01120939, + "auxiliary_loss_mlp": 0.01112157, + "balance_loss_clip": 1.0018295, + "balance_loss_mlp": 1.00057733, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.8847062978103573, + "language_loss": 0.65512705, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67745799, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 4.132895231246948 + }, + { + "auxiliary_loss_clip": 0.01152024, + "auxiliary_loss_mlp": 0.01112106, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00052619, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.6816656654512312, + "language_loss": 0.70140249, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72404385, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.523834466934204 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01111891, + "balance_loss_clip": 1.00198734, + "balance_loss_mlp": 1.00059772, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.6256342160238186, + "language_loss": 0.77473724, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79704851, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.632507085800171 + }, + { + "auxiliary_loss_clip": 0.01103424, + "auxiliary_loss_mlp": 0.01112164, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00058365, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.6945263027094108, + "language_loss": 0.79513276, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81728864, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.6685194969177246 + }, + { + "auxiliary_loss_clip": 0.01120363, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_clip": 1.00171447, + "balance_loss_mlp": 1.00053525, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.149828267054027, + "language_loss": 0.76557124, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78789312, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.608931064605713 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01112477, + "balance_loss_clip": 1.00178373, + "balance_loss_mlp": 1.00061083, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 2.038543295392841, + "language_loss": 0.68770498, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70985395, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.6672475337982178 + }, + { + "auxiliary_loss_clip": 0.01167299, + "auxiliary_loss_mlp": 0.01111525, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00051761, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.2350298417797982, + "language_loss": 0.66687083, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68965906, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.5586137771606445 + }, + { + "auxiliary_loss_clip": 0.01133915, + "auxiliary_loss_mlp": 0.01112857, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.00041842, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.8756442766871728, + "language_loss": 0.87192547, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89439315, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 3.974360227584839 + }, + { + "auxiliary_loss_clip": 0.0113522, + "auxiliary_loss_mlp": 0.01112385, + "balance_loss_clip": 1.00195467, + "balance_loss_mlp": 1.00070953, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.33302211715475, + "language_loss": 0.5923084, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61478448, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 2.620361328125 + }, + { + "auxiliary_loss_clip": 0.01150921, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00061083, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.7878310730525087, + "language_loss": 0.69344479, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71608162, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.509448289871216 + }, + { + "auxiliary_loss_clip": 0.01101862, + "auxiliary_loss_mlp": 0.01112873, + "balance_loss_clip": 1.00176442, + "balance_loss_mlp": 1.0004344, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9255078449603653, + "language_loss": 0.79213703, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81428438, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.6532726287841797 + }, + { + "auxiliary_loss_clip": 0.01087211, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_clip": 1.00161207, + "balance_loss_mlp": 1.00052953, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 1.861180370358246, + "language_loss": 0.66728508, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.68927449, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 2.7309181690216064 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01112951, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.0005132, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 1.859710840677032, + "language_loss": 0.78661066, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80878341, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.624884605407715 + }, + { + "auxiliary_loss_clip": 0.01135649, + "auxiliary_loss_mlp": 0.00747871, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.0011884, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.4554477922081124, + "language_loss": 0.59028906, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.60912424, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.596226215362549 + }, + { + "auxiliary_loss_clip": 0.01152181, + "auxiliary_loss_mlp": 0.01110491, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00053215, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.25075437186742, + "language_loss": 0.72063774, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74326444, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 2.5929765701293945 + }, + { + "auxiliary_loss_clip": 0.0113342, + "auxiliary_loss_mlp": 0.0111254, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00057817, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 3.709771170448062, + "language_loss": 0.75658476, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77904433, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 2.5201773643493652 + }, + { + "auxiliary_loss_clip": 0.01119, + "auxiliary_loss_mlp": 0.01113254, + "balance_loss_clip": 1.00178707, + "balance_loss_mlp": 1.00062525, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.7439804785478477, + "language_loss": 0.72746575, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74978828, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.638529062271118 + }, + { + "auxiliary_loss_clip": 0.01150742, + "auxiliary_loss_mlp": 0.01111526, + "balance_loss_clip": 1.00178862, + "balance_loss_mlp": 1.00061405, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.542867096985879, + "language_loss": 0.73369801, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75632071, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.6365277767181396 + }, + { + "auxiliary_loss_clip": 0.01167206, + "auxiliary_loss_mlp": 0.01112062, + "balance_loss_clip": 1.00195861, + "balance_loss_mlp": 1.00057745, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 2.0265666430193874, + "language_loss": 0.82889849, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85169113, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 2.492043972015381 + }, + { + "auxiliary_loss_clip": 0.01150385, + "auxiliary_loss_mlp": 0.0111152, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.0005126, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 3.0627919206436007, + "language_loss": 0.72372472, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74634373, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.5863444805145264 + }, + { + "auxiliary_loss_clip": 0.01167222, + "auxiliary_loss_mlp": 0.01112117, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00072742, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 1.5914378857522187, + "language_loss": 0.77701342, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79980683, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.587601900100708 + }, + { + "auxiliary_loss_clip": 0.01133706, + "auxiliary_loss_mlp": 0.01112544, + "balance_loss_clip": 1.00178039, + "balance_loss_mlp": 1.00048709, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.8022620010783277, + "language_loss": 0.70320636, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72566885, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.6165919303894043 + }, + { + "auxiliary_loss_clip": 0.01100829, + "auxiliary_loss_mlp": 0.01091758, + "balance_loss_clip": 1.00150633, + "balance_loss_mlp": 1.00020552, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7739380693850785, + "language_loss": 0.55583203, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57775795, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.1699507236480713 + }, + { + "auxiliary_loss_clip": 0.011331, + "auxiliary_loss_mlp": 0.0111138, + "balance_loss_clip": 1.00182045, + "balance_loss_mlp": 1.00075424, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.4125674183590486, + "language_loss": 0.81842256, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84086728, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.583977222442627 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.00747786, + "balance_loss_clip": 1.00185549, + "balance_loss_mlp": 1.0012176, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.528562283267803, + "language_loss": 0.74362469, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76228666, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.7885758876800537 + }, + { + "auxiliary_loss_clip": 0.0115195, + "auxiliary_loss_mlp": 0.0111195, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00056136, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.8041263688982703, + "language_loss": 0.82753992, + "learning_rate": 1.690266496731839e-06, + "loss": 0.8501789, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.6129403114318848 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.01111602, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00068974, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 1.9142284819017559, + "language_loss": 0.65097851, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67328233, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 2.6059305667877197 + }, + { + "auxiliary_loss_clip": 0.0113597, + "auxiliary_loss_mlp": 0.01113276, + "balance_loss_clip": 1.00192678, + "balance_loss_mlp": 1.00064731, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 2.9253474324840636, + "language_loss": 0.8104648, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83295727, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.5826175212860107 + }, + { + "auxiliary_loss_clip": 0.01167119, + "auxiliary_loss_mlp": 0.0111112, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00058901, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 2.088030973562107, + "language_loss": 0.73223853, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75502092, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.5324504375457764 + }, + { + "auxiliary_loss_clip": 0.01129778, + "auxiliary_loss_mlp": 0.0109223, + "balance_loss_clip": 1.00122285, + "balance_loss_mlp": 1.00029564, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6229332073123126, + "language_loss": 0.53440881, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55662894, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.2563507556915283 + }, + { + "auxiliary_loss_clip": 0.01167172, + "auxiliary_loss_mlp": 0.01111815, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.00061703, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.6039562241741296, + "language_loss": 0.68872803, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71151793, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 3.931443452835083 + }, + { + "auxiliary_loss_clip": 0.01120167, + "auxiliary_loss_mlp": 0.01111577, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00056911, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 2.417012772779124, + "language_loss": 0.75934052, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78165793, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.6826422214508057 + }, + { + "auxiliary_loss_clip": 0.01133376, + "auxiliary_loss_mlp": 0.01112851, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00060296, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 1.8397774649907386, + "language_loss": 0.75202191, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77448416, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.5489513874053955 + }, + { + "auxiliary_loss_clip": 0.01150241, + "auxiliary_loss_mlp": 0.01111139, + "balance_loss_clip": 1.00176442, + "balance_loss_mlp": 1.00070381, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 3.2462875434406273, + "language_loss": 0.76324201, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78585589, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.529806137084961 + }, + { + "auxiliary_loss_clip": 0.01135491, + "auxiliary_loss_mlp": 0.01111723, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00061953, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 1.7492921650530946, + "language_loss": 0.71406281, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73653495, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.558164119720459 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01112298, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00052786, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3674063522482696, + "language_loss": 0.82569903, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.8480128, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 4.0636656284332275 + }, + { + "auxiliary_loss_clip": 0.01151797, + "auxiliary_loss_mlp": 0.01110721, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.00038052, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.6508082627881837, + "language_loss": 0.66222763, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68485284, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.588834047317505 + }, + { + "auxiliary_loss_clip": 0.01118321, + "auxiliary_loss_mlp": 0.00747762, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00114584, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 1.9967458980635315, + "language_loss": 0.80845392, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.8271147, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.5733203887939453 + }, + { + "auxiliary_loss_clip": 0.01135117, + "auxiliary_loss_mlp": 0.01112285, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00060976, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.3494159203594804, + "language_loss": 0.69206721, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.7145412, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 4.183910608291626 + }, + { + "auxiliary_loss_clip": 0.01118623, + "auxiliary_loss_mlp": 0.01111268, + "balance_loss_clip": 1.00206912, + "balance_loss_mlp": 1.00054669, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.335592720008546, + "language_loss": 0.74951458, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.77181345, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 2.633408546447754 + }, + { + "auxiliary_loss_clip": 0.01167362, + "auxiliary_loss_mlp": 0.01113718, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00061226, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.341710976475072, + "language_loss": 0.82353705, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84634781, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.487696886062622 + }, + { + "auxiliary_loss_clip": 0.01135499, + "auxiliary_loss_mlp": 0.01111275, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00064886, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 1.9372245849217689, + "language_loss": 0.71402419, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73649192, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.639674425125122 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01111994, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00050926, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 1.9849663391237762, + "language_loss": 0.73917699, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76133668, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.6204984188079834 + }, + { + "auxiliary_loss_clip": 0.01106327, + "auxiliary_loss_mlp": 0.01112176, + "balance_loss_clip": 1.00194228, + "balance_loss_mlp": 1.00069189, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 1.7436796548079894, + "language_loss": 0.7200464, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74223149, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.733848810195923 + }, + { + "auxiliary_loss_clip": 0.01163432, + "auxiliary_loss_mlp": 0.01091597, + "balance_loss_clip": 1.00146317, + "balance_loss_mlp": 1.0000447, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7488328898108121, + "language_loss": 0.54444641, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56699669, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.2335267066955566 + }, + { + "auxiliary_loss_clip": 0.01150386, + "auxiliary_loss_mlp": 0.01112428, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00056219, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.645136113044744, + "language_loss": 0.7086156, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73124373, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 3.9131433963775635 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01111563, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00045991, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 2.2112722371153093, + "language_loss": 0.75758922, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78004134, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 2.5915029048919678 + }, + { + "auxiliary_loss_clip": 0.01152356, + "auxiliary_loss_mlp": 0.01111447, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.00063026, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 1.7734524591608167, + "language_loss": 0.82029319, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84293121, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.5248610973358154 + }, + { + "auxiliary_loss_clip": 0.01150343, + "auxiliary_loss_mlp": 0.01112902, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.0006541, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.3708698307349145, + "language_loss": 0.69578499, + "learning_rate": 1.681420084607516e-06, + "loss": 0.71841741, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.520799398422241 + }, + { + "auxiliary_loss_clip": 0.01150558, + "auxiliary_loss_mlp": 0.01112119, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.0005393, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.4254182561512276, + "language_loss": 0.74524295, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76786977, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 2.677351713180542 + }, + { + "auxiliary_loss_clip": 0.01150488, + "auxiliary_loss_mlp": 0.01110374, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00051105, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.9204930036718577, + "language_loss": 0.82201344, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84462208, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 2.555198907852173 + }, + { + "auxiliary_loss_clip": 0.01120608, + "auxiliary_loss_mlp": 0.01112752, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00079036, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.2056964160177612, + "language_loss": 0.63644159, + "learning_rate": 1.680266672116467e-06, + "loss": 0.65877521, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.5898895263671875 + }, + { + "auxiliary_loss_clip": 0.0113358, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00058842, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.7034318830813, + "language_loss": 0.91906369, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94151068, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.5529046058654785 + }, + { + "auxiliary_loss_clip": 0.01150973, + "auxiliary_loss_mlp": 0.01113318, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00068927, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 3.0204117767384586, + "language_loss": 0.59859848, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62124133, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.5889222621917725 + }, + { + "auxiliary_loss_clip": 0.01103493, + "auxiliary_loss_mlp": 0.01111964, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.00057435, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 2.234752605028855, + "language_loss": 0.81439155, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83654606, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 2.6686878204345703 + }, + { + "auxiliary_loss_clip": 0.01133666, + "auxiliary_loss_mlp": 0.0111178, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00058174, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.663555533764626, + "language_loss": 0.87358022, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89603466, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.5671603679656982 + }, + { + "auxiliary_loss_clip": 0.01152608, + "auxiliary_loss_mlp": 0.01110991, + "balance_loss_clip": 1.00207257, + "balance_loss_mlp": 1.00065124, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 1.83299407117026, + "language_loss": 0.85061705, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.87325305, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 2.5120389461517334 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.01092016, + "balance_loss_clip": 1.00142002, + "balance_loss_mlp": 1.00008225, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.8058515088910426, + "language_loss": 0.58295864, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60534602, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.1235973834991455 + }, + { + "auxiliary_loss_clip": 0.01133615, + "auxiliary_loss_mlp": 0.01111466, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.000458, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 1.9416408062267299, + "language_loss": 0.70220733, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72465813, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.606325149536133 + }, + { + "auxiliary_loss_clip": 0.01117951, + "auxiliary_loss_mlp": 0.01112425, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00065422, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 2.413764554264151, + "language_loss": 0.67075932, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69306314, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.6391358375549316 + }, + { + "auxiliary_loss_clip": 0.01132265, + "auxiliary_loss_mlp": 0.01091728, + "balance_loss_clip": 1.00145888, + "balance_loss_mlp": 1.00017548, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7755780741109766, + "language_loss": 0.58129358, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60353351, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.1015355587005615 + }, + { + "auxiliary_loss_clip": 0.01105204, + "auxiliary_loss_mlp": 0.01112221, + "balance_loss_clip": 1.00162029, + "balance_loss_mlp": 1.00054598, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8523948442802172, + "language_loss": 0.73832679, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.76050103, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.733038902282715 + }, + { + "auxiliary_loss_clip": 0.01118821, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_clip": 1.00177896, + "balance_loss_mlp": 1.00046873, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 1.7689153148794805, + "language_loss": 0.60990417, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63221574, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.6934869289398193 + }, + { + "auxiliary_loss_clip": 0.01118585, + "auxiliary_loss_mlp": 0.01110984, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00054836, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.866916802082064, + "language_loss": 0.80838966, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83068532, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.7197022438049316 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01111255, + "balance_loss_clip": 1.00167823, + "balance_loss_mlp": 1.00053334, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.6856199041540028, + "language_loss": 0.7782836, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80045307, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.832157850265503 + }, + { + "auxiliary_loss_clip": 0.01105977, + "auxiliary_loss_mlp": 0.0111125, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00043321, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.4501866089595739, + "language_loss": 0.68635273, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70852506, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.7090017795562744 + }, + { + "auxiliary_loss_clip": 0.01133908, + "auxiliary_loss_mlp": 0.01110427, + "balance_loss_clip": 1.00204134, + "balance_loss_mlp": 1.00056398, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 1.8725599146666283, + "language_loss": 0.67192703, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69437039, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.579617977142334 + }, + { + "auxiliary_loss_clip": 0.01135666, + "auxiliary_loss_mlp": 0.01110387, + "balance_loss_clip": 1.0020417, + "balance_loss_mlp": 1.00061905, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.5217762344431596, + "language_loss": 0.74513853, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76759905, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 4.069579601287842 + }, + { + "auxiliary_loss_clip": 0.01102296, + "auxiliary_loss_mlp": 0.01112643, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00058603, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.5958355278236203, + "language_loss": 0.79581928, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81796861, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.698707342147827 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.01110753, + "balance_loss_clip": 1.0017767, + "balance_loss_mlp": 1.00060368, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.2908212991092234, + "language_loss": 0.71046841, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73277432, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.7375402450561523 + }, + { + "auxiliary_loss_clip": 0.01102105, + "auxiliary_loss_mlp": 0.01111789, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00068569, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.02919763404883, + "language_loss": 0.81127095, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83340997, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.642143964767456 + }, + { + "auxiliary_loss_clip": 0.01118618, + "auxiliary_loss_mlp": 0.01111626, + "balance_loss_clip": 1.00191438, + "balance_loss_mlp": 1.00052321, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.6882858030264978, + "language_loss": 0.78542572, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80772817, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.6209075450897217 + }, + { + "auxiliary_loss_clip": 0.01167056, + "auxiliary_loss_mlp": 0.01112314, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00063872, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 1.9309815679522961, + "language_loss": 0.83813322, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.86092693, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.477083206176758 + }, + { + "auxiliary_loss_clip": 0.01150419, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_clip": 1.00186217, + "balance_loss_mlp": 1.00050581, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.8560646626338655, + "language_loss": 0.67072093, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69334698, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 3.977621078491211 + }, + { + "auxiliary_loss_clip": 0.01150111, + "auxiliary_loss_mlp": 0.01110299, + "balance_loss_clip": 1.00185215, + "balance_loss_mlp": 1.00053144, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.493655407208025, + "language_loss": 0.58275199, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.6053561, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.5906007289886475 + }, + { + "auxiliary_loss_clip": 0.01070705, + "auxiliary_loss_mlp": 0.01110575, + "balance_loss_clip": 1.00170326, + "balance_loss_mlp": 1.00052047, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.5050240952777791, + "language_loss": 0.69493401, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71674681, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 4.109966993331909 + }, + { + "auxiliary_loss_clip": 0.01068407, + "auxiliary_loss_mlp": 0.0111068, + "balance_loss_clip": 1.00161409, + "balance_loss_mlp": 1.00043511, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.4833890327949535, + "language_loss": 0.78180385, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80359477, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 2.7354750633239746 + }, + { + "auxiliary_loss_clip": 0.01131176, + "auxiliary_loss_mlp": 0.01091578, + "balance_loss_clip": 1.00137115, + "balance_loss_mlp": 1.00002563, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6984648134076742, + "language_loss": 0.49162233, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51384985, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.2905447483062744 + }, + { + "auxiliary_loss_clip": 0.01150582, + "auxiliary_loss_mlp": 0.00747919, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.0012989, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.7062569481902004, + "language_loss": 0.62514651, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64413154, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 2.6359074115753174 + }, + { + "auxiliary_loss_clip": 0.01135448, + "auxiliary_loss_mlp": 0.01112083, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.0005033, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.5480725010743466, + "language_loss": 0.68885034, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71132565, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.591712713241577 + }, + { + "auxiliary_loss_clip": 0.01152348, + "auxiliary_loss_mlp": 0.01112163, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.0003922, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.6935307788936451, + "language_loss": 0.64716774, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.6698128, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.599135637283325 + }, + { + "auxiliary_loss_clip": 0.01063083, + "auxiliary_loss_mlp": 0.0109236, + "balance_loss_clip": 1.00130796, + "balance_loss_mlp": 1.0004257, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7135644347676137, + "language_loss": 0.59670556, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61826003, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.494077682495117 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.00747751, + "balance_loss_clip": 1.00182021, + "balance_loss_mlp": 1.00123262, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.5729459313913952, + "language_loss": 0.74237132, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76121688, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 5.045876741409302 + }, + { + "auxiliary_loss_clip": 0.01118719, + "auxiliary_loss_mlp": 0.01112716, + "balance_loss_clip": 1.00182331, + "balance_loss_mlp": 1.0006597, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.7360278521789179, + "language_loss": 0.73259187, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.75490624, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.6777195930480957 + }, + { + "auxiliary_loss_clip": 0.01150338, + "auxiliary_loss_mlp": 0.01110886, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00083232, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.5329125033365454, + "language_loss": 0.81671381, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83932602, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.5925300121307373 + }, + { + "auxiliary_loss_clip": 0.01133869, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00060105, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.7800008008487598, + "language_loss": 0.80728835, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82974696, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 2.575770616531372 + }, + { + "auxiliary_loss_clip": 0.0116723, + "auxiliary_loss_mlp": 0.00747837, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00112867, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.9092741085999707, + "language_loss": 0.78361827, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80276895, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.5818710327148438 + }, + { + "auxiliary_loss_clip": 0.01134061, + "auxiliary_loss_mlp": 0.01112218, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00063813, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 2.088775848192032, + "language_loss": 0.5865624, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.60902524, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.5757997035980225 + }, + { + "auxiliary_loss_clip": 0.0115041, + "auxiliary_loss_mlp": 0.01111847, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00074375, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 1.7037527706442435, + "language_loss": 0.81910765, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84173024, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 2.575214147567749 + }, + { + "auxiliary_loss_clip": 0.01167147, + "auxiliary_loss_mlp": 0.01112124, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00073433, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.978193518190859, + "language_loss": 0.862252, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88504481, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.5121910572052 + }, + { + "auxiliary_loss_clip": 0.01137192, + "auxiliary_loss_mlp": 0.0111256, + "balance_loss_clip": 1.00195622, + "balance_loss_mlp": 1.00059819, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 3.0288006029061054, + "language_loss": 0.73876452, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.76126206, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.6061837673187256 + }, + { + "auxiliary_loss_clip": 0.01134694, + "auxiliary_loss_mlp": 0.00747832, + "balance_loss_clip": 1.00181317, + "balance_loss_mlp": 1.00113511, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 2.7578815495560383, + "language_loss": 0.75128514, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77011043, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.571699380874634 + }, + { + "auxiliary_loss_clip": 0.01167107, + "auxiliary_loss_mlp": 0.01112909, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00056601, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 1.9165552981547849, + "language_loss": 0.72262883, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74542898, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 2.531728506088257 + }, + { + "auxiliary_loss_clip": 0.01104218, + "auxiliary_loss_mlp": 0.01110036, + "balance_loss_clip": 1.00174356, + "balance_loss_mlp": 1.00045907, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.5495505094228736, + "language_loss": 0.73546696, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75760949, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 2.648918628692627 + }, + { + "auxiliary_loss_clip": 0.01103454, + "auxiliary_loss_mlp": 0.01112, + "balance_loss_clip": 1.00184441, + "balance_loss_mlp": 1.00051546, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.4478578909826998, + "language_loss": 0.77660453, + "learning_rate": 1.663746609539197e-06, + "loss": 0.79875904, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.69826602935791 + }, + { + "auxiliary_loss_clip": 0.01167266, + "auxiliary_loss_mlp": 0.01114479, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00061035, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.7645238913401102, + "language_loss": 0.63514626, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65796369, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.506535768508911 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.01111018, + "balance_loss_clip": 1.0019182, + "balance_loss_mlp": 1.00048709, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.895645288857235, + "language_loss": 0.66712505, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68975687, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.581258773803711 + }, + { + "auxiliary_loss_clip": 0.01135342, + "auxiliary_loss_mlp": 0.00747894, + "balance_loss_clip": 1.00172544, + "balance_loss_mlp": 1.00118852, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.7383533646588292, + "language_loss": 0.71640247, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73523486, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.633723020553589 + }, + { + "auxiliary_loss_clip": 0.01167197, + "auxiliary_loss_mlp": 0.01113459, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00073504, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.465797823343708, + "language_loss": 0.73833704, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76114357, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.6282289028167725 + }, + { + "auxiliary_loss_clip": 0.01150852, + "auxiliary_loss_mlp": 0.0111204, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00074625, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 1.7274905533931901, + "language_loss": 0.61077058, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63339949, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.5889453887939453 + }, + { + "auxiliary_loss_clip": 0.01134894, + "auxiliary_loss_mlp": 0.01111217, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.0004952, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.436763957582545, + "language_loss": 0.7507093, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77317047, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.6496403217315674 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.01112572, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00070548, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8056363964205897, + "language_loss": 0.83872795, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.86118948, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.5917556285858154 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01113211, + "balance_loss_clip": 1.00158572, + "balance_loss_mlp": 1.00058162, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 1.9230802399277622, + "language_loss": 0.75603396, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77837092, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.622138500213623 + }, + { + "auxiliary_loss_clip": 0.01103636, + "auxiliary_loss_mlp": 0.01112473, + "balance_loss_clip": 1.00177443, + "balance_loss_mlp": 1.00070214, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 1.8452781065211965, + "language_loss": 0.82294369, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.84510481, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.642970323562622 + }, + { + "auxiliary_loss_clip": 0.01120027, + "auxiliary_loss_mlp": 0.01110736, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00068235, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.8452109256156344, + "language_loss": 0.74555457, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.7678622, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.607410192489624 + }, + { + "auxiliary_loss_clip": 0.01133324, + "auxiliary_loss_mlp": 0.01113015, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00057662, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 1.8790218532111214, + "language_loss": 0.77374899, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79621232, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 3.9455196857452393 + }, + { + "auxiliary_loss_clip": 0.01118691, + "auxiliary_loss_mlp": 0.01112859, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00089729, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.712300049904027, + "language_loss": 0.81144738, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83376288, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.609577178955078 + }, + { + "auxiliary_loss_clip": 0.01167074, + "auxiliary_loss_mlp": 0.01111862, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00047326, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.51739442443251, + "language_loss": 0.70935482, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73214412, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.6028687953948975 + }, + { + "auxiliary_loss_clip": 0.01118374, + "auxiliary_loss_mlp": 0.01112885, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.00063753, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.9780225464091432, + "language_loss": 0.73398811, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75630069, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.642700433731079 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01112066, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.0005815, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 1.7399670031745564, + "language_loss": 0.74958879, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77206874, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.635927438735962 + }, + { + "auxiliary_loss_clip": 0.01103666, + "auxiliary_loss_mlp": 0.01113922, + "balance_loss_clip": 1.00178349, + "balance_loss_mlp": 1.00062513, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.1109588545286537, + "language_loss": 0.7648626, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78703845, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.6763508319854736 + }, + { + "auxiliary_loss_clip": 0.01136661, + "auxiliary_loss_mlp": 0.01112446, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00067532, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.6619391003507622, + "language_loss": 0.74689186, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76938289, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 4.050280570983887 + }, + { + "auxiliary_loss_clip": 0.011369, + "auxiliary_loss_mlp": 0.01111445, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00072384, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 1.6131473230942288, + "language_loss": 0.66389793, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.68638134, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.603358030319214 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01114142, + "balance_loss_clip": 1.00166202, + "balance_loss_mlp": 1.00065482, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 2.540908326671045, + "language_loss": 0.71609235, + "learning_rate": 1.656454488573026e-06, + "loss": 0.73859298, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 4.098421335220337 + }, + { + "auxiliary_loss_clip": 0.01118961, + "auxiliary_loss_mlp": 0.01111716, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00070882, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.5381679957952163, + "language_loss": 0.7039144, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72622114, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.6187045574188232 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.00747659, + "balance_loss_clip": 1.0016489, + "balance_loss_mlp": 1.00112319, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 1.6590202877749667, + "language_loss": 0.69753671, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71603084, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.7167253494262695 + }, + { + "auxiliary_loss_clip": 0.01135426, + "auxiliary_loss_mlp": 0.01111105, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00057364, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 2.145615712650556, + "language_loss": 0.60635316, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62881851, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.607208251953125 + }, + { + "auxiliary_loss_clip": 0.01102565, + "auxiliary_loss_mlp": 0.01112732, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00058007, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.7935675013813484, + "language_loss": 0.73488724, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75704026, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 2.6936256885528564 + }, + { + "auxiliary_loss_clip": 0.01136108, + "auxiliary_loss_mlp": 0.01112056, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.0006671, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.6611392449408247, + "language_loss": 0.76360965, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78609127, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.5762054920196533 + }, + { + "auxiliary_loss_clip": 0.01149843, + "auxiliary_loss_mlp": 0.01112423, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00065243, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.7449090827966298, + "language_loss": 0.66341513, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68603778, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 3.992554187774658 + }, + { + "auxiliary_loss_clip": 0.01150567, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_clip": 1.00186265, + "balance_loss_mlp": 1.00061655, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.104784672207542, + "language_loss": 0.68831527, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.71095335, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.553058385848999 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01111663, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 1.00055981, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 2.123826053200966, + "language_loss": 0.76844913, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79077101, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.5832223892211914 + }, + { + "auxiliary_loss_clip": 0.01083687, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.00145578, + "balance_loss_mlp": 1.00066018, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 42.14188833577851, + "language_loss": 0.7164129, + "learning_rate": 1.65300196133547e-06, + "loss": 0.73836935, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.7647902965545654 + }, + { + "auxiliary_loss_clip": 0.01151388, + "auxiliary_loss_mlp": 0.01111619, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00061131, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.356572712122383, + "language_loss": 0.73098457, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.7536146, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.5878164768218994 + }, + { + "auxiliary_loss_clip": 0.01151664, + "auxiliary_loss_mlp": 0.0111216, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00048482, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8311965286547842, + "language_loss": 0.72922695, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75186515, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.535860061645508 + }, + { + "auxiliary_loss_clip": 0.01150415, + "auxiliary_loss_mlp": 0.01111472, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00055993, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.628864389889079, + "language_loss": 0.73773617, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76035506, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.5064330101013184 + }, + { + "auxiliary_loss_clip": 0.01151671, + "auxiliary_loss_mlp": 0.00747839, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00111806, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.672936816654855, + "language_loss": 0.83675718, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.85575223, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.561366081237793 + }, + { + "auxiliary_loss_clip": 0.01136703, + "auxiliary_loss_mlp": 0.01111221, + "balance_loss_clip": 1.00176704, + "balance_loss_mlp": 1.00049973, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.8918257347577414, + "language_loss": 0.72177213, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74425137, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 2.610948324203491 + }, + { + "auxiliary_loss_clip": 0.01117139, + "auxiliary_loss_mlp": 0.01091185, + "balance_loss_clip": 1.00127602, + "balance_loss_mlp": 1.00001383, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7106801670173533, + "language_loss": 0.55346638, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.5755496, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.2737600803375244 + }, + { + "auxiliary_loss_clip": 0.01152503, + "auxiliary_loss_mlp": 0.01113291, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.0005666, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.0003969484015087, + "language_loss": 0.63862956, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66128743, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.546186923980713 + }, + { + "auxiliary_loss_clip": 0.01105368, + "auxiliary_loss_mlp": 0.01111935, + "balance_loss_clip": 1.00158036, + "balance_loss_mlp": 1.00054598, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.9286724450668813, + "language_loss": 0.79207897, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81425196, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.6806483268737793 + }, + { + "auxiliary_loss_clip": 0.01118637, + "auxiliary_loss_mlp": 0.01113672, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 1.00056648, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.134099001343798, + "language_loss": 0.69516373, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.7174868, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.6033029556274414 + }, + { + "auxiliary_loss_clip": 0.01134364, + "auxiliary_loss_mlp": 0.01112271, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00059605, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.4958863370969038, + "language_loss": 0.74895227, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77141857, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 2.5993359088897705 + }, + { + "auxiliary_loss_clip": 0.01118663, + "auxiliary_loss_mlp": 0.01112014, + "balance_loss_clip": 1.00168085, + "balance_loss_mlp": 1.00071979, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.5824184472785592, + "language_loss": 0.57346392, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59577072, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.597079277038574 + }, + { + "auxiliary_loss_clip": 0.01121291, + "auxiliary_loss_mlp": 0.01111013, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00057745, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.7438379088223301, + "language_loss": 0.73812282, + "learning_rate": 1.648400251450638e-06, + "loss": 0.76044583, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.6243200302124023 + }, + { + "auxiliary_loss_clip": 0.01117446, + "auxiliary_loss_mlp": 0.01091795, + "balance_loss_clip": 1.00138378, + "balance_loss_mlp": 1.00024223, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.674413037407188, + "language_loss": 0.57599843, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59809089, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.215622901916504 + }, + { + "auxiliary_loss_clip": 0.01152328, + "auxiliary_loss_mlp": 0.01111949, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00065517, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.6170619321966522, + "language_loss": 0.53588432, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55852711, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.668607234954834 + }, + { + "auxiliary_loss_clip": 0.01167196, + "auxiliary_loss_mlp": 0.01112819, + "balance_loss_clip": 1.00198507, + "balance_loss_mlp": 1.00066733, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.503929177248599, + "language_loss": 0.79875088, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82155097, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.555457592010498 + }, + { + "auxiliary_loss_clip": 0.0113376, + "auxiliary_loss_mlp": 0.01112802, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00084102, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.6748787929631355, + "language_loss": 0.66605955, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.6885252, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.6015210151672363 + }, + { + "auxiliary_loss_clip": 0.01116823, + "auxiliary_loss_mlp": 0.01111935, + "balance_loss_clip": 1.00167978, + "balance_loss_mlp": 1.00064111, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.771323318628843, + "language_loss": 0.70922297, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73151052, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.6961326599121094 + }, + { + "auxiliary_loss_clip": 0.0112191, + "auxiliary_loss_mlp": 0.01110868, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00062346, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.560870184726984, + "language_loss": 0.69684434, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71917212, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.6340694427490234 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01111419, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00069714, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.5867695708932876, + "language_loss": 0.7130267, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73532903, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.01120691, + "auxiliary_loss_mlp": 0.00747843, + "balance_loss_clip": 1.00173283, + "balance_loss_mlp": 1.00115454, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 1.904357609225692, + "language_loss": 0.72060311, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.73928845, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 4.070229768753052 + }, + { + "auxiliary_loss_clip": 0.01151811, + "auxiliary_loss_mlp": 0.01112105, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00052547, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.5555215879846191, + "language_loss": 0.78233469, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80497384, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 2.535010576248169 + }, + { + "auxiliary_loss_clip": 0.01135367, + "auxiliary_loss_mlp": 0.01110718, + "balance_loss_clip": 1.00194085, + "balance_loss_mlp": 1.00047338, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 1.5332185837529122, + "language_loss": 0.7777369, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80019778, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.6047632694244385 + }, + { + "auxiliary_loss_clip": 0.01133389, + "auxiliary_loss_mlp": 0.01111828, + "balance_loss_clip": 1.00169647, + "balance_loss_mlp": 1.0008204, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 1.4572829549286415, + "language_loss": 0.81085116, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83330333, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.5993857383728027 + }, + { + "auxiliary_loss_clip": 0.01167124, + "auxiliary_loss_mlp": 0.0074793, + "balance_loss_clip": 1.00187933, + "balance_loss_mlp": 1.00117123, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.067925572579028, + "language_loss": 0.61137617, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.63052666, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.5620627403259277 + }, + { + "auxiliary_loss_clip": 0.01150415, + "auxiliary_loss_mlp": 0.01111716, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00061333, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.5928218412037731, + "language_loss": 0.65387821, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67649949, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.5577242374420166 + }, + { + "auxiliary_loss_clip": 0.0113346, + "auxiliary_loss_mlp": 0.010912, + "balance_loss_clip": 1.00135183, + "balance_loss_mlp": 1.00002897, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6649972911133418, + "language_loss": 0.48041815, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50266474, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 4.6399147510528564 + }, + { + "auxiliary_loss_clip": 0.01118145, + "auxiliary_loss_mlp": 0.0074788, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.00119972, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.4036874523749265, + "language_loss": 0.85886997, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.87753022, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.662759780883789 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01111508, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00050044, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.5238529388992716, + "language_loss": 0.79111481, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81341732, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 4.089622259140015 + }, + { + "auxiliary_loss_clip": 0.01134539, + "auxiliary_loss_mlp": 0.0111068, + "balance_loss_clip": 1.00175214, + "balance_loss_mlp": 1.00053096, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.6187281750046334, + "language_loss": 0.69771588, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72016805, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 2.5935537815093994 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01111014, + "balance_loss_clip": 1.00170934, + "balance_loss_mlp": 1.00057817, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.877224311913722, + "language_loss": 0.76219988, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78449082, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.6566853523254395 + }, + { + "auxiliary_loss_clip": 0.01112546, + "auxiliary_loss_mlp": 0.00745905, + "balance_loss_clip": 1.00123298, + "balance_loss_mlp": 1.00025356, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7964355589820716, + "language_loss": 0.57389176, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59247625, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.1425130367279053 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.00747889, + "balance_loss_clip": 1.00171113, + "balance_loss_mlp": 1.0011456, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.8112312684870957, + "language_loss": 0.71938765, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73820555, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.6239371299743652 + }, + { + "auxiliary_loss_clip": 0.01167187, + "auxiliary_loss_mlp": 0.0111187, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.00048113, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5897668929131894, + "language_loss": 0.78064245, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80343306, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.4948275089263916 + }, + { + "auxiliary_loss_clip": 0.01167268, + "auxiliary_loss_mlp": 0.01112702, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00054955, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.4160209629003675, + "language_loss": 0.80749857, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.8302983, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.5658466815948486 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.01113537, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00081229, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 1.9119614878287818, + "language_loss": 0.66680533, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68899864, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 4.076609373092651 + }, + { + "auxiliary_loss_clip": 0.01167178, + "auxiliary_loss_mlp": 0.0111243, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00065947, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 1.7951463107939216, + "language_loss": 0.69538909, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71818519, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.5997958183288574 + }, + { + "auxiliary_loss_clip": 0.01150728, + "auxiliary_loss_mlp": 0.00747931, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00124252, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.6099132339946065, + "language_loss": 0.81228459, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83127117, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.6352343559265137 + }, + { + "auxiliary_loss_clip": 0.01167083, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00057483, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.552415858930174, + "language_loss": 0.66113973, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68392456, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.5302202701568604 + }, + { + "auxiliary_loss_clip": 0.01135793, + "auxiliary_loss_mlp": 0.01112755, + "balance_loss_clip": 1.00205398, + "balance_loss_mlp": 1.00069809, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.5078817981417292, + "language_loss": 0.7194981, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.74198359, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 2.57376766204834 + }, + { + "auxiliary_loss_clip": 0.01120811, + "auxiliary_loss_mlp": 0.01112411, + "balance_loss_clip": 1.00174975, + "balance_loss_mlp": 1.00054491, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.6788907894165395, + "language_loss": 0.76175904, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78409129, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.671753168106079 + }, + { + "auxiliary_loss_clip": 0.01135697, + "auxiliary_loss_mlp": 0.01112038, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.0006485, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.590802224456838, + "language_loss": 0.74748915, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.76996654, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.5777201652526855 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01111677, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00057399, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 1.9912783135138947, + "language_loss": 0.82191366, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84437197, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.5991756916046143 + }, + { + "auxiliary_loss_clip": 0.01116699, + "auxiliary_loss_mlp": 0.01111215, + "balance_loss_clip": 1.00163901, + "balance_loss_mlp": 1.00068378, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.098517995438322, + "language_loss": 0.85849369, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88077283, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 2.601195812225342 + }, + { + "auxiliary_loss_clip": 0.01118996, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.0005188, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.8238073394593468, + "language_loss": 0.75203848, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77434176, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.6379966735839844 + }, + { + "auxiliary_loss_clip": 0.01166986, + "auxiliary_loss_mlp": 0.01111753, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00055456, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.5918828266320815, + "language_loss": 0.81929374, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84208107, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.4967498779296875 + }, + { + "auxiliary_loss_clip": 0.01118662, + "auxiliary_loss_mlp": 0.00747872, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00112903, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.7180781861467231, + "language_loss": 0.77625608, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.7949214, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.6324069499969482 + }, + { + "auxiliary_loss_clip": 0.01135565, + "auxiliary_loss_mlp": 0.01112299, + "balance_loss_clip": 1.00175214, + "balance_loss_mlp": 1.00062335, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.3651668163437989, + "language_loss": 0.68056524, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70304382, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.59814190864563 + }, + { + "auxiliary_loss_clip": 0.01167275, + "auxiliary_loss_mlp": 0.01112518, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00055671, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.4376945333744424, + "language_loss": 0.79941356, + "learning_rate": 1.634606741699593e-06, + "loss": 0.8222115, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.4727766513824463 + }, + { + "auxiliary_loss_clip": 0.01150122, + "auxiliary_loss_mlp": 0.01112098, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00061321, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.6267556965147867, + "language_loss": 0.71858287, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.7412051, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.526884078979492 + }, + { + "auxiliary_loss_clip": 0.01133628, + "auxiliary_loss_mlp": 0.01110917, + "balance_loss_clip": 1.00176656, + "balance_loss_mlp": 1.00057673, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.330851661996051, + "language_loss": 0.6926524, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71509784, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 2.621642589569092 + }, + { + "auxiliary_loss_clip": 0.01135529, + "auxiliary_loss_mlp": 0.01111609, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00069702, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.726042028345844, + "language_loss": 0.61405224, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.6365236, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.5531880855560303 + }, + { + "auxiliary_loss_clip": 0.01134728, + "auxiliary_loss_mlp": 0.01110662, + "balance_loss_clip": 1.00180495, + "balance_loss_mlp": 1.00051284, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.321356588613676, + "language_loss": 0.75586742, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.77832133, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.5537960529327393 + }, + { + "auxiliary_loss_clip": 0.01148552, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_clip": 1.00145364, + "balance_loss_mlp": 1.00002468, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8884605408439796, + "language_loss": 0.66844964, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.69084716, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.096795082092285 + }, + { + "auxiliary_loss_clip": 0.01152408, + "auxiliary_loss_mlp": 0.01112541, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00077057, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 1.7940681595048704, + "language_loss": 0.81093967, + "learning_rate": 1.63230955093099e-06, + "loss": 0.8335892, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.6589951515197754 + }, + { + "auxiliary_loss_clip": 0.01152277, + "auxiliary_loss_mlp": 0.01111826, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00062799, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.606610387132824, + "language_loss": 0.85695755, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.87959862, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.5698494911193848 + }, + { + "auxiliary_loss_clip": 0.01134091, + "auxiliary_loss_mlp": 0.01112234, + "balance_loss_clip": 1.00171363, + "balance_loss_mlp": 1.00046349, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.6408303876062735, + "language_loss": 0.87392211, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89638537, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.5608692169189453 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01111242, + "balance_loss_clip": 1.00161839, + "balance_loss_mlp": 1.00042522, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.6485631663731506, + "language_loss": 0.85393095, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87608272, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.707998275756836 + }, + { + "auxiliary_loss_clip": 0.01150123, + "auxiliary_loss_mlp": 0.01111092, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.000561, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.8830829409468042, + "language_loss": 0.79108059, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81369275, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 3.912590742111206 + }, + { + "auxiliary_loss_clip": 0.0116702, + "auxiliary_loss_mlp": 0.01110976, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00044537, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.3861369262345935, + "language_loss": 0.82883841, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85161841, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.615861177444458 + }, + { + "auxiliary_loss_clip": 0.01133597, + "auxiliary_loss_mlp": 0.01111929, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00063562, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 1.8975350682675225, + "language_loss": 0.72002375, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74247909, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.638742446899414 + }, + { + "auxiliary_loss_clip": 0.0116698, + "auxiliary_loss_mlp": 0.00747806, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00117791, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.7544495535878022, + "language_loss": 0.78535783, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80450571, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.6023294925689697 + }, + { + "auxiliary_loss_clip": 0.01134958, + "auxiliary_loss_mlp": 0.0111079, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00064063, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.6230790209698236, + "language_loss": 0.71732557, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73978305, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.5864579677581787 + }, + { + "auxiliary_loss_clip": 0.01134537, + "auxiliary_loss_mlp": 0.01110698, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.00054836, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.7569689167752307, + "language_loss": 0.70000279, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72245508, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.572019338607788 + }, + { + "auxiliary_loss_clip": 0.01151654, + "auxiliary_loss_mlp": 0.01111524, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00042057, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.3417317472805823, + "language_loss": 0.65155923, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.674191, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 4.179997205734253 + }, + { + "auxiliary_loss_clip": 0.01133234, + "auxiliary_loss_mlp": 0.0111059, + "balance_loss_clip": 1.00162578, + "balance_loss_mlp": 1.00044012, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 2.0181823998857356, + "language_loss": 0.72649777, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74893606, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.6147170066833496 + }, + { + "auxiliary_loss_clip": 0.01151529, + "auxiliary_loss_mlp": 0.0111071, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00065637, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.5586442224245671, + "language_loss": 0.80122918, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82385159, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 4.137570858001709 + }, + { + "auxiliary_loss_clip": 0.01151927, + "auxiliary_loss_mlp": 0.01112395, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00071979, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.8803953577903465, + "language_loss": 0.72256482, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74520808, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.565169095993042 + }, + { + "auxiliary_loss_clip": 0.01167145, + "auxiliary_loss_mlp": 0.01111893, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00050354, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.8870011202362074, + "language_loss": 0.85513413, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.87792444, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.509113073348999 + }, + { + "auxiliary_loss_clip": 0.01131485, + "auxiliary_loss_mlp": 0.01091161, + "balance_loss_clip": 1.00138879, + "balance_loss_mlp": 0.99999017, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7594254889208597, + "language_loss": 0.561382, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58360851, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 3.024088144302368 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01111626, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00052261, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 1.4128178078750075, + "language_loss": 0.66375816, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68622518, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.6480000019073486 + }, + { + "auxiliary_loss_clip": 0.01151956, + "auxiliary_loss_mlp": 0.01112687, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00063062, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 1.8175798217196009, + "language_loss": 0.75358486, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77623141, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.6780316829681396 + }, + { + "auxiliary_loss_clip": 0.01166948, + "auxiliary_loss_mlp": 0.01111405, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.0005883, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.2302907972105144, + "language_loss": 0.78796768, + "learning_rate": 1.625421002822686e-06, + "loss": 0.8107512, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.5700812339782715 + }, + { + "auxiliary_loss_clip": 0.0115024, + "auxiliary_loss_mlp": 0.01111346, + "balance_loss_clip": 1.00186861, + "balance_loss_mlp": 1.00062418, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.6312285048549187, + "language_loss": 0.85819447, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.88081038, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 3.9557578563690186 + }, + { + "auxiliary_loss_clip": 0.01135406, + "auxiliary_loss_mlp": 0.01112302, + "balance_loss_clip": 1.00182855, + "balance_loss_mlp": 1.0006268, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 2.003179850320381, + "language_loss": 0.75000536, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77248251, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.6233694553375244 + }, + { + "auxiliary_loss_clip": 0.01137711, + "auxiliary_loss_mlp": 0.01112623, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00056601, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.6097616540708035, + "language_loss": 0.71583951, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73834276, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.6897692680358887 + }, + { + "auxiliary_loss_clip": 0.01120073, + "auxiliary_loss_mlp": 0.01111598, + "balance_loss_clip": 1.0017395, + "balance_loss_mlp": 1.00059044, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.7629962259395908, + "language_loss": 0.70124567, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.72356236, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 2.7034332752227783 + }, + { + "auxiliary_loss_clip": 0.01166971, + "auxiliary_loss_mlp": 0.01111353, + "balance_loss_clip": 1.00194108, + "balance_loss_mlp": 1.00063169, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.7090376373844978, + "language_loss": 0.62678665, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64956987, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.623119592666626 + }, + { + "auxiliary_loss_clip": 0.01150262, + "auxiliary_loss_mlp": 0.01112123, + "balance_loss_clip": 1.00192916, + "balance_loss_mlp": 1.00063848, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.5084654393899026, + "language_loss": 0.83410347, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85672736, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.5577614307403564 + }, + { + "auxiliary_loss_clip": 0.01103544, + "auxiliary_loss_mlp": 0.01111753, + "balance_loss_clip": 1.00173306, + "balance_loss_mlp": 1.0005542, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 2.390601759571253, + "language_loss": 0.73069316, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75284612, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.712406635284424 + }, + { + "auxiliary_loss_clip": 0.01150285, + "auxiliary_loss_mlp": 0.00747881, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00119817, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.6526873267306526, + "language_loss": 0.80070674, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.81968844, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.6108956336975098 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.01112514, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00055218, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.151048759887518, + "language_loss": 0.64567602, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66813272, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.633957624435425 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01111878, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00058389, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 1.9261680075546066, + "language_loss": 0.83338451, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.85584426, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 2.560844659805298 + }, + { + "auxiliary_loss_clip": 0.01118674, + "auxiliary_loss_mlp": 0.01112309, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00053799, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 1.7796664265215612, + "language_loss": 0.73606789, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75837767, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.6795566082000732 + }, + { + "auxiliary_loss_clip": 0.01104328, + "auxiliary_loss_mlp": 0.01112166, + "balance_loss_clip": 1.00174582, + "balance_loss_mlp": 1.00039518, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 1.9044019028298904, + "language_loss": 0.75994653, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78211141, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.7032296657562256 + }, + { + "auxiliary_loss_clip": 0.01133441, + "auxiliary_loss_mlp": 0.0111127, + "balance_loss_clip": 1.00180399, + "balance_loss_mlp": 1.00064361, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.6568510060934065, + "language_loss": 0.5686298, + "learning_rate": 1.620448797546459e-06, + "loss": 0.59107697, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 2.6456100940704346 + }, + { + "auxiliary_loss_clip": 0.01135707, + "auxiliary_loss_mlp": 0.01111216, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00059032, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.0885246458631324, + "language_loss": 0.76194638, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.7844156, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.537529230117798 + }, + { + "auxiliary_loss_clip": 0.01152194, + "auxiliary_loss_mlp": 0.01111632, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00062478, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 1.8665063382824783, + "language_loss": 0.74430537, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76694369, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.5346686840057373 + }, + { + "auxiliary_loss_clip": 0.01122439, + "auxiliary_loss_mlp": 0.01112053, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00056815, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.0393541105796875, + "language_loss": 0.69670069, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71904558, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.641403913497925 + }, + { + "auxiliary_loss_clip": 0.01100892, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_clip": 1.00200915, + "balance_loss_mlp": 1.00058675, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.536236901611222, + "language_loss": 0.79681587, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81894839, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.6935369968414307 + }, + { + "auxiliary_loss_clip": 0.01135175, + "auxiliary_loss_mlp": 0.01112076, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00059128, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 2.4053214844408153, + "language_loss": 0.6802792, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70275164, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 2.5734078884124756 + }, + { + "auxiliary_loss_clip": 0.01116932, + "auxiliary_loss_mlp": 0.01112975, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00063193, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 2.7598395041527977, + "language_loss": 0.71918941, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74148846, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.654352903366089 + }, + { + "auxiliary_loss_clip": 0.0115069, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_clip": 1.00211167, + "balance_loss_mlp": 1.00052583, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.819673554569203, + "language_loss": 0.80112779, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82375479, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.570833921432495 + }, + { + "auxiliary_loss_clip": 0.01150684, + "auxiliary_loss_mlp": 0.01112726, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00047874, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 1.8379510669620505, + "language_loss": 0.83540952, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85804361, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.52597975730896 + }, + { + "auxiliary_loss_clip": 0.01151831, + "auxiliary_loss_mlp": 0.00747961, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00136745, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.2980077371874632, + "language_loss": 0.71039283, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72939074, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.610720157623291 + }, + { + "auxiliary_loss_clip": 0.01133622, + "auxiliary_loss_mlp": 0.01112285, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00051379, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.027516078673555, + "language_loss": 0.7240867, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.74654573, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.5480659008026123 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.0111148, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00066304, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 3.9043112551338983, + "language_loss": 0.74165833, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76427662, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 3.9878416061401367 + }, + { + "auxiliary_loss_clip": 0.01152183, + "auxiliary_loss_mlp": 0.01112022, + "balance_loss_clip": 1.00199234, + "balance_loss_mlp": 1.00072837, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.494441950587072, + "language_loss": 0.67814779, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70078981, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 2.511414051055908 + }, + { + "auxiliary_loss_clip": 0.01135773, + "auxiliary_loss_mlp": 0.01114159, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.000862, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 5.381673789912539, + "language_loss": 0.71127605, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73377538, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.551208257675171 + }, + { + "auxiliary_loss_clip": 0.01133577, + "auxiliary_loss_mlp": 0.00747793, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00134778, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.6105441292242004, + "language_loss": 0.79448998, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81330359, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.614116907119751 + }, + { + "auxiliary_loss_clip": 0.01085504, + "auxiliary_loss_mlp": 0.01111629, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00062108, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 1.6282363576578818, + "language_loss": 0.63948143, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66145277, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 2.708017349243164 + }, + { + "auxiliary_loss_clip": 0.01152465, + "auxiliary_loss_mlp": 0.0111279, + "balance_loss_clip": 1.00204158, + "balance_loss_mlp": 1.00063765, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.5634555966663501, + "language_loss": 0.71038353, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.7330361, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 3.9252381324768066 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.01111546, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.00072896, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.6241618648285687, + "language_loss": 0.84050155, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86263961, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.634558916091919 + }, + { + "auxiliary_loss_clip": 0.01088819, + "auxiliary_loss_mlp": 0.01113534, + "balance_loss_clip": 1.00168014, + "balance_loss_mlp": 1.00071454, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 1.6799340901245954, + "language_loss": 0.57438332, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59640688, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.7084734439849854 + }, + { + "auxiliary_loss_clip": 0.01136112, + "auxiliary_loss_mlp": 0.01110996, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.0004648, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.5673819324007072, + "language_loss": 0.7565732, + "learning_rate": 1.613186112465078e-06, + "loss": 0.77904427, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 4.0685789585113525 + }, + { + "auxiliary_loss_clip": 0.01100535, + "auxiliary_loss_mlp": 0.01091172, + "balance_loss_clip": 1.00161004, + "balance_loss_mlp": 1.00000083, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7381533715948149, + "language_loss": 0.60717547, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62909245, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.3285186290740967 + }, + { + "auxiliary_loss_clip": 0.01135707, + "auxiliary_loss_mlp": 0.01111868, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00057435, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 2.1265508134049558, + "language_loss": 0.75722152, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77969724, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 2.573037624359131 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01112168, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00058818, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.5377147221046632, + "language_loss": 0.74522752, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76784766, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.5515921115875244 + }, + { + "auxiliary_loss_clip": 0.0116711, + "auxiliary_loss_mlp": 0.01111984, + "balance_loss_clip": 1.00198638, + "balance_loss_mlp": 1.00049973, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.570692737144029, + "language_loss": 0.71537602, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73816693, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.5279455184936523 + }, + { + "auxiliary_loss_clip": 0.01150283, + "auxiliary_loss_mlp": 0.01112337, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00066173, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.262985351803966, + "language_loss": 0.55580199, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57842821, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.621917724609375 + }, + { + "auxiliary_loss_clip": 0.01166765, + "auxiliary_loss_mlp": 0.01111205, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00076914, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.4711822555696688, + "language_loss": 0.64497411, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66775388, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 4.001584768295288 + }, + { + "auxiliary_loss_clip": 0.01152005, + "auxiliary_loss_mlp": 0.01111726, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00052738, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.4374518573543584, + "language_loss": 0.67167401, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69431132, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.8430662155151367 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01111579, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00057125, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.793896314324206, + "language_loss": 0.72161061, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74407274, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.6259260177612305 + }, + { + "auxiliary_loss_clip": 0.01166913, + "auxiliary_loss_mlp": 0.01110505, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.00054669, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.6925636987848305, + "language_loss": 0.76315129, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78592545, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.690629482269287 + }, + { + "auxiliary_loss_clip": 0.01103697, + "auxiliary_loss_mlp": 0.01112768, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00061584, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 3.7456386871532668, + "language_loss": 0.66617668, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.68834126, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.7142841815948486 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01111499, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.00068235, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.4236028519395223, + "language_loss": 0.79999036, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.8224591, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.6254963874816895 + }, + { + "auxiliary_loss_clip": 0.01133357, + "auxiliary_loss_mlp": 0.01111556, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00054836, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.7158681384993568, + "language_loss": 0.69271231, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71516144, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.5770065784454346 + }, + { + "auxiliary_loss_clip": 0.0115029, + "auxiliary_loss_mlp": 0.01110905, + "balance_loss_clip": 1.00182676, + "balance_loss_mlp": 1.00056505, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.5802983956025898, + "language_loss": 0.66351485, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68612677, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.522376775741577 + }, + { + "auxiliary_loss_clip": 0.01132886, + "auxiliary_loss_mlp": 0.01110694, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00054514, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.5846454846618327, + "language_loss": 0.72766781, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75010359, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.5741539001464844 + }, + { + "auxiliary_loss_clip": 0.01133738, + "auxiliary_loss_mlp": 0.01113483, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00056756, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.2400142979582407, + "language_loss": 0.64399236, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66646457, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.607823371887207 + }, + { + "auxiliary_loss_clip": 0.01137093, + "auxiliary_loss_mlp": 0.01111983, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00068951, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.894988205751519, + "language_loss": 0.85418767, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87667847, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.5527703762054443 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01112674, + "balance_loss_clip": 1.00215936, + "balance_loss_mlp": 1.00071228, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 1.977276242671548, + "language_loss": 0.67247546, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69527555, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.4930038452148438 + }, + { + "auxiliary_loss_clip": 0.01133356, + "auxiliary_loss_mlp": 0.01091156, + "balance_loss_clip": 1.00143504, + "balance_loss_mlp": 0.99998486, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6412632831486049, + "language_loss": 0.57203197, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59427714, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.3232336044311523 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.01112097, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.0006125, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.773984513375374, + "language_loss": 0.82281309, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84529102, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 2.5464224815368652 + }, + { + "auxiliary_loss_clip": 0.011631, + "auxiliary_loss_mlp": 0.01091197, + "balance_loss_clip": 1.00146937, + "balance_loss_mlp": 1.00002611, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6215509657698308, + "language_loss": 0.4954204, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51796335, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.1424999237060547 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01110343, + "balance_loss_clip": 1.00169241, + "balance_loss_mlp": 1.00047994, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.4577720674785446, + "language_loss": 0.84790766, + "learning_rate": 1.605165098835465e-06, + "loss": 0.87033963, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.5692713260650635 + }, + { + "auxiliary_loss_clip": 0.01150218, + "auxiliary_loss_mlp": 0.01111196, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00056934, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.7386921522484962, + "language_loss": 0.80160785, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82422197, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.56526780128479 + }, + { + "auxiliary_loss_clip": 0.01135614, + "auxiliary_loss_mlp": 0.0111225, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00057435, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.4303974245419577, + "language_loss": 0.65931416, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68179286, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.588927745819092 + }, + { + "auxiliary_loss_clip": 0.01135918, + "auxiliary_loss_mlp": 0.01112223, + "balance_loss_clip": 1.00188887, + "balance_loss_mlp": 1.00064278, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.7829318963406051, + "language_loss": 0.78069597, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80317736, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.5985186100006104 + }, + { + "auxiliary_loss_clip": 0.01166759, + "auxiliary_loss_mlp": 0.01110432, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00037766, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.9768895210636765, + "language_loss": 0.7948103, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81758219, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 2.5014994144439697 + }, + { + "auxiliary_loss_clip": 0.01085218, + "auxiliary_loss_mlp": 0.00747875, + "balance_loss_clip": 1.00154519, + "balance_loss_mlp": 1.00119674, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 3.39696730618771, + "language_loss": 0.63082606, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.64915693, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.7207629680633545 + }, + { + "auxiliary_loss_clip": 0.0116703, + "auxiliary_loss_mlp": 0.00747865, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00123286, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.5246061974853464, + "language_loss": 0.77954841, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.79869735, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.5721004009246826 + }, + { + "auxiliary_loss_clip": 0.01085527, + "auxiliary_loss_mlp": 0.01091566, + "balance_loss_clip": 1.00124252, + "balance_loss_mlp": 1.00001359, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 1.0699022535013547, + "language_loss": 0.59651232, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61828321, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.441037893295288 + }, + { + "auxiliary_loss_clip": 0.01152114, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.0018667, + "balance_loss_mlp": 1.00067306, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 1.5310035347522035, + "language_loss": 0.70860589, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73125243, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.6531667709350586 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01110728, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.0005784, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.7455036064370522, + "language_loss": 0.70695066, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.72922611, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 4.091912031173706 + }, + { + "auxiliary_loss_clip": 0.01166942, + "auxiliary_loss_mlp": 0.01111257, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00063062, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.009893322251743, + "language_loss": 0.69991958, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72270155, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 2.487672805786133 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01112551, + "balance_loss_clip": 1.00192714, + "balance_loss_mlp": 1.00049436, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 3.9430899073571517, + "language_loss": 0.67445177, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69691265, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.739243745803833 + }, + { + "auxiliary_loss_clip": 0.01133543, + "auxiliary_loss_mlp": 0.01111276, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00064969, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 1.9439887891050505, + "language_loss": 0.81716567, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83961391, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.596343517303467 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01110747, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00059795, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.4598841745933235, + "language_loss": 0.72695756, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74928868, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.6183576583862305 + }, + { + "auxiliary_loss_clip": 0.01166842, + "auxiliary_loss_mlp": 0.01110581, + "balance_loss_clip": 1.00199592, + "balance_loss_mlp": 1.00052679, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 1.6965376163936587, + "language_loss": 0.78139383, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80416805, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 3.852156400680542 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.0074799, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00134325, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5927015242567635, + "language_loss": 0.72150707, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74050987, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.5841004848480225 + }, + { + "auxiliary_loss_clip": 0.01088664, + "auxiliary_loss_mlp": 0.01111253, + "balance_loss_clip": 1.0018487, + "balance_loss_mlp": 1.00062633, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.8726414603610866, + "language_loss": 0.68731642, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70931566, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.702267646789551 + }, + { + "auxiliary_loss_clip": 0.01134948, + "auxiliary_loss_mlp": 0.01110381, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00061285, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.497442040976782, + "language_loss": 0.72955763, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.752011, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 4.030431032180786 + }, + { + "auxiliary_loss_clip": 0.01150174, + "auxiliary_loss_mlp": 0.01110693, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00054359, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.5192103303132913, + "language_loss": 0.76339346, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78600216, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.5671732425689697 + }, + { + "auxiliary_loss_clip": 0.01118687, + "auxiliary_loss_mlp": 0.01111655, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00055194, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.6368011792173538, + "language_loss": 0.83447969, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85678309, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.5899384021759033 + }, + { + "auxiliary_loss_clip": 0.01118839, + "auxiliary_loss_mlp": 0.01113465, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00045431, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.5805054835630215, + "language_loss": 0.78220701, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80453002, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.6521267890930176 + }, + { + "auxiliary_loss_clip": 0.01167077, + "auxiliary_loss_mlp": 0.01112046, + "balance_loss_clip": 1.002038, + "balance_loss_mlp": 1.00056171, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.5580005681237432, + "language_loss": 0.73841834, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76120961, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.4797074794769287 + }, + { + "auxiliary_loss_clip": 0.0111914, + "auxiliary_loss_mlp": 0.01111913, + "balance_loss_clip": 1.00170839, + "balance_loss_mlp": 1.00052428, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.9646156292382069, + "language_loss": 0.69758368, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71989417, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 4.000013828277588 + }, + { + "auxiliary_loss_clip": 0.01118192, + "auxiliary_loss_mlp": 0.01111621, + "balance_loss_clip": 1.00178254, + "balance_loss_mlp": 1.00061369, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 2.2977299828048507, + "language_loss": 0.7663942, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78869236, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.701765298843384 + }, + { + "auxiliary_loss_clip": 0.01118906, + "auxiliary_loss_mlp": 0.01109965, + "balance_loss_clip": 1.00166953, + "balance_loss_mlp": 1.0004828, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.5356604538218441, + "language_loss": 0.77290332, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79519206, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.6325807571411133 + }, + { + "auxiliary_loss_clip": 0.01120194, + "auxiliary_loss_mlp": 0.01110991, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00036478, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.4180532493963836, + "language_loss": 0.68657845, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70889026, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.6410224437713623 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01111012, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00048065, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.6487417101820128, + "language_loss": 0.83376992, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85640132, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.5504472255706787 + }, + { + "auxiliary_loss_clip": 0.01166945, + "auxiliary_loss_mlp": 0.01112021, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00063157, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.5593705439191576, + "language_loss": 0.79605079, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81884038, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.5154950618743896 + }, + { + "auxiliary_loss_clip": 0.01151261, + "auxiliary_loss_mlp": 0.0110993, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00054383, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.693412420826321, + "language_loss": 0.77791095, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.80052286, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.5324299335479736 + }, + { + "auxiliary_loss_clip": 0.011168, + "auxiliary_loss_mlp": 0.01110818, + "balance_loss_clip": 1.00179219, + "balance_loss_mlp": 1.00047815, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.7359269195837648, + "language_loss": 0.81061465, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83289075, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.623472213745117 + }, + { + "auxiliary_loss_clip": 0.01152176, + "auxiliary_loss_mlp": 0.01112343, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00076282, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.4698356738972176, + "language_loss": 0.67374891, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69639409, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.574035882949829 + }, + { + "auxiliary_loss_clip": 0.01151096, + "auxiliary_loss_mlp": 0.01110881, + "balance_loss_clip": 1.00184023, + "balance_loss_mlp": 1.00054121, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.658831740163479, + "language_loss": 0.77488571, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.7975055, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.5551419258117676 + }, + { + "auxiliary_loss_clip": 0.01133538, + "auxiliary_loss_mlp": 0.01110576, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00061703, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.489373867900184, + "language_loss": 0.75005794, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77249908, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.6223220825195312 + }, + { + "auxiliary_loss_clip": 0.0116685, + "auxiliary_loss_mlp": 0.01110389, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00043082, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.4668082541434595, + "language_loss": 0.81351745, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.83628982, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.512563943862915 + }, + { + "auxiliary_loss_clip": 0.01135196, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00051856, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 1.6319164453632375, + "language_loss": 0.7227273, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74518692, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.608978748321533 + }, + { + "auxiliary_loss_clip": 0.01134861, + "auxiliary_loss_mlp": 0.01110924, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00058389, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 2.1254510917690816, + "language_loss": 0.77202749, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79448533, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 2.5999279022216797 + }, + { + "auxiliary_loss_clip": 0.01088735, + "auxiliary_loss_mlp": 0.01111985, + "balance_loss_clip": 1.00165176, + "balance_loss_mlp": 1.00050092, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.4712447958629316, + "language_loss": 0.70290279, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72491002, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 2.733461856842041 + }, + { + "auxiliary_loss_clip": 0.01129656, + "auxiliary_loss_mlp": 0.01090815, + "balance_loss_clip": 1.00134706, + "balance_loss_mlp": 1.00002515, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7771947886384407, + "language_loss": 0.56001699, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58222163, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.179870367050171 + }, + { + "auxiliary_loss_clip": 0.01116675, + "auxiliary_loss_mlp": 0.01111581, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00066853, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 1.8735542597832184, + "language_loss": 0.71317345, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73545605, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.6973586082458496 + }, + { + "auxiliary_loss_clip": 0.01120073, + "auxiliary_loss_mlp": 0.01110882, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00063682, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 1.7799231684694083, + "language_loss": 0.8248179, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.8471275, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 2.6346054077148438 + }, + { + "auxiliary_loss_clip": 0.01166856, + "auxiliary_loss_mlp": 0.01111218, + "balance_loss_clip": 1.00204301, + "balance_loss_mlp": 1.00049639, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.390891235526039, + "language_loss": 0.69904387, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72182465, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.598865509033203 + }, + { + "auxiliary_loss_clip": 0.0113479, + "auxiliary_loss_mlp": 0.01110384, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.00061584, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.4594593524394663, + "language_loss": 0.71485269, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73730445, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.6659445762634277 + }, + { + "auxiliary_loss_clip": 0.01152005, + "auxiliary_loss_mlp": 0.01110206, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00043821, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.6060446306074543, + "language_loss": 0.84145761, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86407971, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 2.584681749343872 + }, + { + "auxiliary_loss_clip": 0.01134809, + "auxiliary_loss_mlp": 0.01111129, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00050306, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.6122166199290309, + "language_loss": 0.72310102, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74556035, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.6144776344299316 + }, + { + "auxiliary_loss_clip": 0.01135165, + "auxiliary_loss_mlp": 0.01111416, + "balance_loss_clip": 1.00192785, + "balance_loss_mlp": 1.00050342, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 1.9777840227174481, + "language_loss": 0.74446374, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76692951, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.592094659805298 + }, + { + "auxiliary_loss_clip": 0.01119039, + "auxiliary_loss_mlp": 0.0074782, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00125647, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.5253530348387292, + "language_loss": 0.79072392, + "learning_rate": 1.587999618060523e-06, + "loss": 0.80939245, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.63911771774292 + }, + { + "auxiliary_loss_clip": 0.01166977, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00058436, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.540978186225759, + "language_loss": 0.75146705, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77424788, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.5483694076538086 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.01111364, + "balance_loss_clip": 1.00192678, + "balance_loss_mlp": 1.00045204, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 1.6317926079391687, + "language_loss": 0.79186845, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81431866, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 4.0666656494140625 + }, + { + "auxiliary_loss_clip": 0.01118645, + "auxiliary_loss_mlp": 0.01112953, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00070572, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7948473491760568, + "language_loss": 0.77673268, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79904866, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.6720709800720215 + }, + { + "auxiliary_loss_clip": 0.01136412, + "auxiliary_loss_mlp": 0.01112295, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.00061965, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 1.9870767015987396, + "language_loss": 0.63615656, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65864372, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.661314010620117 + }, + { + "auxiliary_loss_clip": 0.0113701, + "auxiliary_loss_mlp": 0.0111056, + "balance_loss_clip": 1.00204074, + "balance_loss_mlp": 1.00060153, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 2.595534241666951, + "language_loss": 0.77350682, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79598254, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.6102724075317383 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01110378, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00060987, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.5605582242717688, + "language_loss": 0.68159682, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70405185, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.5881264209747314 + }, + { + "auxiliary_loss_clip": 0.01101079, + "auxiliary_loss_mlp": 0.01110569, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.00051463, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 3.170137893779487, + "language_loss": 0.72251642, + "learning_rate": 1.585332242234043e-06, + "loss": 0.7446329, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 3.9878222942352295 + }, + { + "auxiliary_loss_clip": 0.01150246, + "auxiliary_loss_mlp": 0.01111012, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.0005765, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.7730711631314116, + "language_loss": 0.72242004, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74503267, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.5511889457702637 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01111337, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00071096, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 1.6487183010209638, + "language_loss": 0.69314826, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.7156142, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.5653324127197266 + }, + { + "auxiliary_loss_clip": 0.01135285, + "auxiliary_loss_mlp": 0.01112807, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.0006547, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 4.491856417071504, + "language_loss": 0.77540797, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79788888, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 3.983874797821045 + }, + { + "auxiliary_loss_clip": 0.01166826, + "auxiliary_loss_mlp": 0.01110895, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00045919, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.7027953257036272, + "language_loss": 0.73781389, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.76059109, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.522505044937134 + }, + { + "auxiliary_loss_clip": 0.01136304, + "auxiliary_loss_mlp": 0.01111326, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.0005095, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.6186848909960139, + "language_loss": 0.73230934, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75478566, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.6285877227783203 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01111515, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00050759, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.740577417190611, + "language_loss": 0.67223275, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69501829, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.510073184967041 + }, + { + "auxiliary_loss_clip": 0.011521, + "auxiliary_loss_mlp": 0.01112273, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00050259, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.3265370576561923, + "language_loss": 0.85639757, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87904131, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.5434341430664062 + }, + { + "auxiliary_loss_clip": 0.01167011, + "auxiliary_loss_mlp": 0.01111653, + "balance_loss_clip": 1.00207841, + "balance_loss_mlp": 1.00055003, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.7686407657314167, + "language_loss": 0.75185573, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77464241, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 3.9501967430114746 + }, + { + "auxiliary_loss_clip": 0.01133167, + "auxiliary_loss_mlp": 0.01113421, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00060105, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.6696445591058857, + "language_loss": 0.58792287, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.6103887, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.732429265975952 + }, + { + "auxiliary_loss_clip": 0.01104634, + "auxiliary_loss_mlp": 0.01112142, + "balance_loss_clip": 1.00171304, + "balance_loss_mlp": 1.00084782, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.6086325149302412, + "language_loss": 0.84176838, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86393607, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.6579904556274414 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.01090445, + "balance_loss_clip": 1.00143373, + "balance_loss_mlp": 1.00003684, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8387556123854323, + "language_loss": 0.63006461, + "learning_rate": 1.581142210256242e-06, + "loss": 0.6524334, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.1675126552581787 + }, + { + "auxiliary_loss_clip": 0.01122277, + "auxiliary_loss_mlp": 0.0111047, + "balance_loss_clip": 1.00187027, + "balance_loss_mlp": 1.00051165, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.69070074065635, + "language_loss": 0.82350868, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84583622, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.606424570083618 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01111473, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00046563, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.044442577253892, + "language_loss": 0.77263165, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79492736, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.6280980110168457 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01112137, + "balance_loss_clip": 1.00197005, + "balance_loss_mlp": 1.00074768, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 3.3291507540684715, + "language_loss": 0.74442172, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76689357, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.5805749893188477 + }, + { + "auxiliary_loss_clip": 0.01133262, + "auxiliary_loss_mlp": 0.01111528, + "balance_loss_clip": 1.00185239, + "balance_loss_mlp": 1.00042534, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.186936564628334, + "language_loss": 0.76782882, + "learning_rate": 1.579619037747193e-06, + "loss": 0.79027665, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.641185760498047 + }, + { + "auxiliary_loss_clip": 0.0116694, + "auxiliary_loss_mlp": 0.01111425, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00051296, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 1.9164302627686456, + "language_loss": 0.74293584, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76571953, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.515533447265625 + }, + { + "auxiliary_loss_clip": 0.01085473, + "auxiliary_loss_mlp": 0.01110415, + "balance_loss_clip": 1.00154924, + "balance_loss_mlp": 1.00083828, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.87959023534163, + "language_loss": 0.70134163, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72330046, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.730525255203247 + }, + { + "auxiliary_loss_clip": 0.01166909, + "auxiliary_loss_mlp": 0.01111193, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.00047183, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 2.3299527746240054, + "language_loss": 0.69824672, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.72102773, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.513828754425049 + }, + { + "auxiliary_loss_clip": 0.01149885, + "auxiliary_loss_mlp": 0.011104, + "balance_loss_clip": 1.00192392, + "balance_loss_mlp": 1.00063217, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.6838001064500145, + "language_loss": 0.71667981, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73928273, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.542632818222046 + }, + { + "auxiliary_loss_clip": 0.01150114, + "auxiliary_loss_mlp": 0.01112769, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00061643, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 1.8061989368826847, + "language_loss": 0.71206582, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73469466, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.568376064300537 + }, + { + "auxiliary_loss_clip": 0.01146448, + "auxiliary_loss_mlp": 0.0109045, + "balance_loss_clip": 1.00148082, + "balance_loss_mlp": 1.0000416, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6448498720353321, + "language_loss": 0.53598779, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55835676, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.1445634365081787 + }, + { + "auxiliary_loss_clip": 0.01151676, + "auxiliary_loss_mlp": 0.01111078, + "balance_loss_clip": 1.00199556, + "balance_loss_mlp": 1.00073802, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 1.9036762312623057, + "language_loss": 0.61902702, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64165455, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.6265740394592285 + }, + { + "auxiliary_loss_clip": 0.01152115, + "auxiliary_loss_mlp": 0.01111406, + "balance_loss_clip": 1.00177193, + "balance_loss_mlp": 1.00049329, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.8715343745777595, + "language_loss": 0.65328568, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67592084, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.565171241760254 + }, + { + "auxiliary_loss_clip": 0.01103793, + "auxiliary_loss_mlp": 0.01110106, + "balance_loss_clip": 1.00165141, + "balance_loss_mlp": 1.00062418, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.4314472542654852, + "language_loss": 0.74673676, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76887578, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.6382174491882324 + }, + { + "auxiliary_loss_clip": 0.01162948, + "auxiliary_loss_mlp": 0.01090416, + "balance_loss_clip": 1.00144541, + "balance_loss_mlp": 1.00000751, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8841141035074566, + "language_loss": 0.58427215, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.6068058, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.1517210006713867 + }, + { + "auxiliary_loss_clip": 0.01135266, + "auxiliary_loss_mlp": 0.01110905, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00046945, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.0211245833267175, + "language_loss": 0.81769848, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84016025, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 2.6163485050201416 + }, + { + "auxiliary_loss_clip": 0.01135117, + "auxiliary_loss_mlp": 0.00747846, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00110769, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.8165405907880676, + "language_loss": 0.81242657, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83125615, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.655357599258423 + }, + { + "auxiliary_loss_clip": 0.01133944, + "auxiliary_loss_mlp": 0.01111628, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00071561, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.5578013909908883, + "language_loss": 0.8117497, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83420539, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.6288704872131348 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.01111025, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00058997, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.6534423774563942, + "language_loss": 0.79979455, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82240748, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.5251266956329346 + }, + { + "auxiliary_loss_clip": 0.01151354, + "auxiliary_loss_mlp": 0.01112095, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00060999, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.3677770168706285, + "language_loss": 0.78771436, + "learning_rate": 1.573909419957653e-06, + "loss": 0.81034887, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 2.599013090133667 + }, + { + "auxiliary_loss_clip": 0.01133367, + "auxiliary_loss_mlp": 0.01110795, + "balance_loss_clip": 1.00177968, + "balance_loss_mlp": 1.0006454, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 1.819819915923553, + "language_loss": 0.64419627, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66663778, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.7925407886505127 + }, + { + "auxiliary_loss_clip": 0.01102294, + "auxiliary_loss_mlp": 0.01110829, + "balance_loss_clip": 1.00156832, + "balance_loss_mlp": 1.00058413, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.5478472557860223, + "language_loss": 0.7317878, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75391901, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.701207399368286 + }, + { + "auxiliary_loss_clip": 0.01118578, + "auxiliary_loss_mlp": 0.0111158, + "balance_loss_clip": 1.0018419, + "balance_loss_mlp": 1.00085878, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 1.8302165789240379, + "language_loss": 0.78465796, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.80695951, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 4.040229082107544 + }, + { + "auxiliary_loss_clip": 0.01102116, + "auxiliary_loss_mlp": 0.0111279, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00063825, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 2.3923491719965857, + "language_loss": 0.60755169, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62970078, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.671469211578369 + }, + { + "auxiliary_loss_clip": 0.01103301, + "auxiliary_loss_mlp": 0.01110904, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00046825, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.5956605622593418, + "language_loss": 0.81625134, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83839333, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.684177875518799 + }, + { + "auxiliary_loss_clip": 0.01118558, + "auxiliary_loss_mlp": 0.01112132, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00055218, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.8525196777344266, + "language_loss": 0.8827939, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90510082, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.637932777404785 + }, + { + "auxiliary_loss_clip": 0.01166871, + "auxiliary_loss_mlp": 0.00747584, + "balance_loss_clip": 1.00194848, + "balance_loss_mlp": 1.00102711, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.4814682846893767, + "language_loss": 0.79048336, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80962783, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.5388026237487793 + }, + { + "auxiliary_loss_clip": 0.01152085, + "auxiliary_loss_mlp": 0.01111169, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00063825, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.488731363542073, + "language_loss": 0.70310426, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72573686, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 3.9307403564453125 + }, + { + "auxiliary_loss_clip": 0.0108933, + "auxiliary_loss_mlp": 0.01111191, + "balance_loss_clip": 1.00203931, + "balance_loss_mlp": 1.00056481, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.3880568145181127, + "language_loss": 0.63361275, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65561801, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 2.711439609527588 + }, + { + "auxiliary_loss_clip": 0.01131696, + "auxiliary_loss_mlp": 0.01090535, + "balance_loss_clip": 1.00209284, + "balance_loss_mlp": 1.00012648, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8115505344490221, + "language_loss": 0.54235601, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56457835, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.2346811294555664 + }, + { + "auxiliary_loss_clip": 0.01131652, + "auxiliary_loss_mlp": 0.01090802, + "balance_loss_clip": 1.00141788, + "balance_loss_mlp": 1.0000124, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7347309714986693, + "language_loss": 0.56171066, + "learning_rate": 1.569724674667319e-06, + "loss": 0.5839352, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 4.408871412277222 + }, + { + "auxiliary_loss_clip": 0.01166879, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_clip": 1.00195467, + "balance_loss_mlp": 1.00056863, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.5082308259792856, + "language_loss": 0.65581095, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67858785, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.513972759246826 + }, + { + "auxiliary_loss_clip": 0.01133323, + "auxiliary_loss_mlp": 0.01110795, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00055051, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.7768016753128977, + "language_loss": 0.83306372, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85550493, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 2.589869260787964 + }, + { + "auxiliary_loss_clip": 0.01166849, + "auxiliary_loss_mlp": 0.01110411, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00035739, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.6261843156960023, + "language_loss": 0.75534379, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77811635, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.487375497817993 + }, + { + "auxiliary_loss_clip": 0.01087488, + "auxiliary_loss_mlp": 0.01110934, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00049853, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 1.9673914114411155, + "language_loss": 0.7459563, + "learning_rate": 1.568203437579977e-06, + "loss": 0.76794052, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.7129626274108887 + }, + { + "auxiliary_loss_clip": 0.01134107, + "auxiliary_loss_mlp": 0.01111103, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00057256, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.8789821345073374, + "language_loss": 0.74165648, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.7641086, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 4.096129655838013 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.01110071, + "balance_loss_clip": 1.00189149, + "balance_loss_mlp": 1.00058901, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 1.9990621692255603, + "language_loss": 0.77753556, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.79996431, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.638493537902832 + }, + { + "auxiliary_loss_clip": 0.01166765, + "auxiliary_loss_mlp": 0.01111666, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00075388, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.6718976184897747, + "language_loss": 0.75096667, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77375102, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.5057339668273926 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.0108932, + "balance_loss_clip": 1.00150633, + "balance_loss_mlp": 1.00005591, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.812898994421356, + "language_loss": 0.57373995, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59609836, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 2.947535514831543 + }, + { + "auxiliary_loss_clip": 0.01119632, + "auxiliary_loss_mlp": 0.01111263, + "balance_loss_clip": 1.00161254, + "balance_loss_mlp": 1.00054097, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.8004887525559992, + "language_loss": 0.69902891, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72133785, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.6184022426605225 + }, + { + "auxiliary_loss_clip": 0.01152069, + "auxiliary_loss_mlp": 0.01110686, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00053644, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 1.9018664921638524, + "language_loss": 0.64695477, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.66958237, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.5716161727905273 + }, + { + "auxiliary_loss_clip": 0.01134884, + "auxiliary_loss_mlp": 0.00747648, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00111747, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.5866153590008154, + "language_loss": 0.73311579, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75194114, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.644695520401001 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01111166, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00082624, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.6588085825580614, + "language_loss": 0.75895846, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78142107, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.614248752593994 + }, + { + "auxiliary_loss_clip": 0.01151332, + "auxiliary_loss_mlp": 0.01109924, + "balance_loss_clip": 1.00176036, + "balance_loss_mlp": 1.0005374, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.554756971367137, + "language_loss": 0.80787957, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.83049208, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.647644281387329 + }, + { + "auxiliary_loss_clip": 0.01146497, + "auxiliary_loss_mlp": 0.01089438, + "balance_loss_clip": 1.00147963, + "balance_loss_mlp": 1.00017393, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.754384606426806, + "language_loss": 0.56982523, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.59218454, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.1008694171905518 + }, + { + "auxiliary_loss_clip": 0.01150179, + "auxiliary_loss_mlp": 0.00747793, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.0012995, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 2.028561285775001, + "language_loss": 0.79203075, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81101048, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.563427209854126 + }, + { + "auxiliary_loss_clip": 0.01135106, + "auxiliary_loss_mlp": 0.0110994, + "balance_loss_clip": 1.00166905, + "balance_loss_mlp": 1.00064898, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.3724126979633102, + "language_loss": 0.76239985, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78485036, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.575070858001709 + }, + { + "auxiliary_loss_clip": 0.01146496, + "auxiliary_loss_mlp": 0.01090099, + "balance_loss_clip": 1.00146532, + "balance_loss_mlp": 1.00007236, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7824705049986121, + "language_loss": 0.54973888, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57210487, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.1980674266815186 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01110547, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00049269, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 1.7890993435041294, + "language_loss": 0.76483285, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78711897, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.602356433868408 + }, + { + "auxiliary_loss_clip": 0.01166853, + "auxiliary_loss_mlp": 0.01111311, + "balance_loss_clip": 1.00186026, + "balance_loss_mlp": 1.00058997, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.4730683863327731, + "language_loss": 0.77540219, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.7981838, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 2.5139353275299072 + }, + { + "auxiliary_loss_clip": 0.01104229, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.00070965, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.6673794206416348, + "language_loss": 0.8351745, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85732627, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.785224199295044 + }, + { + "auxiliary_loss_clip": 0.01136908, + "auxiliary_loss_mlp": 0.01110694, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00054467, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.007228417395727, + "language_loss": 0.66056824, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68304431, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.5913960933685303 + }, + { + "auxiliary_loss_clip": 0.01151914, + "auxiliary_loss_mlp": 0.01110517, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00055802, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.521132037726359, + "language_loss": 0.71671337, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73933768, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.5964062213897705 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01109974, + "balance_loss_clip": 1.00164485, + "balance_loss_mlp": 1.00058746, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.6863181449902822, + "language_loss": 0.84905469, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87148523, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.595599412918091 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01109674, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00057411, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.410584703513936, + "language_loss": 0.7823782, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80497408, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.553713321685791 + }, + { + "auxiliary_loss_clip": 0.01166887, + "auxiliary_loss_mlp": 0.01110399, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00063062, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.8182095026624987, + "language_loss": 0.71484983, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73762274, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 2.501723289489746 + }, + { + "auxiliary_loss_clip": 0.01133302, + "auxiliary_loss_mlp": 0.01110279, + "balance_loss_clip": 1.0018971, + "balance_loss_mlp": 1.00060689, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 1.5143217336704538, + "language_loss": 0.81294239, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83537823, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.6051785945892334 + }, + { + "auxiliary_loss_clip": 0.01120983, + "auxiliary_loss_mlp": 0.011097, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00059962, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 1.5337088192387955, + "language_loss": 0.80652773, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82883453, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.6279380321502686 + }, + { + "auxiliary_loss_clip": 0.01152051, + "auxiliary_loss_mlp": 0.01110374, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00060654, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.3414379256886444, + "language_loss": 0.75086105, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.7734853, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.7812466621398926 + }, + { + "auxiliary_loss_clip": 0.01119795, + "auxiliary_loss_mlp": 0.01108777, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00053537, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.5359855749957656, + "language_loss": 0.81889808, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.84118384, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 4.104487419128418 + }, + { + "auxiliary_loss_clip": 0.01150489, + "auxiliary_loss_mlp": 0.01110116, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.00053859, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.4694426874973434, + "language_loss": 0.78573877, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80834484, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.586073398590088 + }, + { + "auxiliary_loss_clip": 0.01131811, + "auxiliary_loss_mlp": 0.01089326, + "balance_loss_clip": 1.0014993, + "balance_loss_mlp": 1.00006187, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7647952845609429, + "language_loss": 0.56599373, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58820504, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.173067569732666 + }, + { + "auxiliary_loss_clip": 0.01118379, + "auxiliary_loss_mlp": 0.01108855, + "balance_loss_clip": 1.0016675, + "balance_loss_mlp": 1.00061297, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.4401451172472668, + "language_loss": 0.65688753, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67915988, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.669879674911499 + }, + { + "auxiliary_loss_clip": 0.01166938, + "auxiliary_loss_mlp": 0.01111765, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00056624, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 3.0871318258432403, + "language_loss": 0.78427434, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80706137, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.5390543937683105 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.00747767, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.00122142, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.6029681954551749, + "language_loss": 0.73526722, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75396633, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.6779699325561523 + }, + { + "auxiliary_loss_clip": 0.01133373, + "auxiliary_loss_mlp": 0.01110629, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00047994, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.7621963578864461, + "language_loss": 0.69341534, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71585536, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 4.105561256408691 + }, + { + "auxiliary_loss_clip": 0.01166787, + "auxiliary_loss_mlp": 0.01110323, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00045943, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.6121614579215842, + "language_loss": 0.80121922, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82399029, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 2.5088019371032715 + }, + { + "auxiliary_loss_clip": 0.01136511, + "auxiliary_loss_mlp": 0.011101, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00052261, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 1.9377549780716765, + "language_loss": 0.7237525, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74621856, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 2.5792479515075684 + }, + { + "auxiliary_loss_clip": 0.01135165, + "auxiliary_loss_mlp": 0.01109652, + "balance_loss_clip": 1.00184095, + "balance_loss_mlp": 1.00055158, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.5846701132951646, + "language_loss": 0.74895978, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77140796, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 3.990078926086426 + }, + { + "auxiliary_loss_clip": 0.01151374, + "auxiliary_loss_mlp": 0.01110402, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00073004, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 1.997273310837301, + "language_loss": 0.80410606, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82672381, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.6256513595581055 + }, + { + "auxiliary_loss_clip": 0.01135143, + "auxiliary_loss_mlp": 0.01110964, + "balance_loss_clip": 1.00185406, + "balance_loss_mlp": 1.0006237, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.4282311697636405, + "language_loss": 0.67809278, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.70055389, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.585930824279785 + }, + { + "auxiliary_loss_clip": 0.01166732, + "auxiliary_loss_mlp": 0.01110478, + "balance_loss_clip": 1.00185335, + "balance_loss_mlp": 1.00061476, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.051795086022797, + "language_loss": 0.75886929, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78164136, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.573343276977539 + }, + { + "auxiliary_loss_clip": 0.01116168, + "auxiliary_loss_mlp": 0.01110643, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00058866, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.641740633908093, + "language_loss": 0.82757068, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.84983873, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.61653995513916 + }, + { + "auxiliary_loss_clip": 0.01163, + "auxiliary_loss_mlp": 0.01090042, + "balance_loss_clip": 1.00149035, + "balance_loss_mlp": 1.00001526, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9278803602674459, + "language_loss": 0.71378672, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73631716, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 4.46537184715271 + }, + { + "auxiliary_loss_clip": 0.0115222, + "auxiliary_loss_mlp": 0.01110926, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00077653, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 1.9218937192941128, + "language_loss": 0.89278316, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91541457, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.526109218597412 + }, + { + "auxiliary_loss_clip": 0.01133521, + "auxiliary_loss_mlp": 0.01109889, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00059748, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.3260596504252369, + "language_loss": 0.68406922, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70650333, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.588204860687256 + }, + { + "auxiliary_loss_clip": 0.01150184, + "auxiliary_loss_mlp": 0.01110386, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00052321, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.720669325355941, + "language_loss": 0.86269534, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88530105, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.527904510498047 + }, + { + "auxiliary_loss_clip": 0.01132106, + "auxiliary_loss_mlp": 0.01111296, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00067008, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.6089478234745411, + "language_loss": 0.8288691, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.8513031, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.5708603858947754 + }, + { + "auxiliary_loss_clip": 0.01082365, + "auxiliary_loss_mlp": 0.00747823, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00134575, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.640557207927404, + "language_loss": 0.67041099, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68871295, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 2.7222328186035156 + }, + { + "auxiliary_loss_clip": 0.01122482, + "auxiliary_loss_mlp": 0.01111127, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00069189, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.831651076134704, + "language_loss": 0.81660104, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83893716, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.616319417953491 + }, + { + "auxiliary_loss_clip": 0.01151937, + "auxiliary_loss_mlp": 0.01109935, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00073957, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.7029559948891422, + "language_loss": 0.77744514, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80006385, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.5834007263183594 + }, + { + "auxiliary_loss_clip": 0.01152211, + "auxiliary_loss_mlp": 0.01110682, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00062764, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 2.467567990335516, + "language_loss": 0.70621651, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72884548, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.6036264896392822 + }, + { + "auxiliary_loss_clip": 0.01166893, + "auxiliary_loss_mlp": 0.01111823, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00062442, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.7573338786227584, + "language_loss": 0.7871356, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80992281, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.5222740173339844 + }, + { + "auxiliary_loss_clip": 0.0115167, + "auxiliary_loss_mlp": 0.01110848, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.0006988, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 1.9848014509917802, + "language_loss": 0.70591533, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72854054, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.5964627265930176 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01113549, + "balance_loss_clip": 1.00197709, + "balance_loss_mlp": 1.00063384, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 2.854090411463037, + "language_loss": 0.52744913, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54963923, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.7085163593292236 + }, + { + "auxiliary_loss_clip": 0.01152327, + "auxiliary_loss_mlp": 0.01111005, + "balance_loss_clip": 1.00206673, + "balance_loss_mlp": 1.00076056, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.0963180903144374, + "language_loss": 0.87507397, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89770722, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.555704116821289 + }, + { + "auxiliary_loss_clip": 0.01136658, + "auxiliary_loss_mlp": 0.0110985, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00075006, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.4592673072858506, + "language_loss": 0.72022706, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74269211, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.5766711235046387 + }, + { + "auxiliary_loss_clip": 0.0115023, + "auxiliary_loss_mlp": 0.01111216, + "balance_loss_clip": 1.00193131, + "balance_loss_mlp": 1.00068545, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 3.126278183692451, + "language_loss": 0.74083191, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76344633, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.4793756008148193 + }, + { + "auxiliary_loss_clip": 0.01122098, + "auxiliary_loss_mlp": 0.01110546, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00049186, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.4367353881518385, + "language_loss": 0.70628327, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72860974, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.818209409713745 + }, + { + "auxiliary_loss_clip": 0.01118248, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_clip": 1.00177729, + "balance_loss_mlp": 1.00057268, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7231916804641159, + "language_loss": 0.82444859, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84673631, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.616976499557495 + }, + { + "auxiliary_loss_clip": 0.01166928, + "auxiliary_loss_mlp": 0.00747857, + "balance_loss_clip": 1.00195444, + "balance_loss_mlp": 1.00118375, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.5873802573738516, + "language_loss": 0.6845293, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70367718, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.517677068710327 + }, + { + "auxiliary_loss_clip": 0.01166769, + "auxiliary_loss_mlp": 0.01111565, + "balance_loss_clip": 1.00179994, + "balance_loss_mlp": 1.00046158, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 2.186277045986723, + "language_loss": 0.58471733, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60750067, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.5470211505889893 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01111285, + "balance_loss_clip": 1.00159681, + "balance_loss_mlp": 1.00056338, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 1.6825121145321216, + "language_loss": 0.75067353, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77312201, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.6373555660247803 + }, + { + "auxiliary_loss_clip": 0.011206, + "auxiliary_loss_mlp": 0.01110676, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00052643, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 2.601359554566624, + "language_loss": 0.75550497, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77781773, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.6264872550964355 + }, + { + "auxiliary_loss_clip": 0.01133343, + "auxiliary_loss_mlp": 0.01109971, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00058436, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.5322154558281054, + "language_loss": 0.74745357, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.76988667, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 2.6145009994506836 + }, + { + "auxiliary_loss_clip": 0.01133633, + "auxiliary_loss_mlp": 0.0111016, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00058293, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.9885672893640305, + "language_loss": 0.80717653, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.82961446, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.6529428958892822 + }, + { + "auxiliary_loss_clip": 0.01135753, + "auxiliary_loss_mlp": 0.01111127, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00059628, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.5948399402929425, + "language_loss": 0.72017694, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.74264568, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.669996976852417 + }, + { + "auxiliary_loss_clip": 0.01129195, + "auxiliary_loss_mlp": 0.01089344, + "balance_loss_clip": 1.00129139, + "balance_loss_mlp": 1.00007963, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7202303093294974, + "language_loss": 0.53273451, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55491984, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 4.788018703460693 + }, + { + "auxiliary_loss_clip": 0.01132837, + "auxiliary_loss_mlp": 0.01111592, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00048876, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 1.847214180690336, + "language_loss": 0.73090732, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75335163, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 2.733715772628784 + }, + { + "auxiliary_loss_clip": 0.01137005, + "auxiliary_loss_mlp": 0.01111372, + "balance_loss_clip": 1.00189793, + "balance_loss_mlp": 1.00045931, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 1.9340297638840507, + "language_loss": 0.81189156, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83437526, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.570427894592285 + }, + { + "auxiliary_loss_clip": 0.01150015, + "auxiliary_loss_mlp": 0.01111081, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.000646, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 2.483564026392822, + "language_loss": 0.72312438, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74573535, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.5823090076446533 + }, + { + "auxiliary_loss_clip": 0.0113371, + "auxiliary_loss_mlp": 0.01110442, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00048375, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.196016859859694, + "language_loss": 0.74972284, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77216434, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.63643217086792 + }, + { + "auxiliary_loss_clip": 0.01116943, + "auxiliary_loss_mlp": 0.01110373, + "balance_loss_clip": 1.00178051, + "balance_loss_mlp": 1.00070047, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.7602234626662998, + "language_loss": 0.71002054, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73229367, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.7344906330108643 + }, + { + "auxiliary_loss_clip": 0.01167048, + "auxiliary_loss_mlp": 0.01111908, + "balance_loss_clip": 1.00202572, + "balance_loss_mlp": 1.00051856, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 2.204452942976415, + "language_loss": 0.74539697, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76818645, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.648646354675293 + }, + { + "auxiliary_loss_clip": 0.01150291, + "auxiliary_loss_mlp": 0.0111131, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00049329, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 4.207457074886199, + "language_loss": 0.77541661, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79803252, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 3.9587199687957764 + }, + { + "auxiliary_loss_clip": 0.01166756, + "auxiliary_loss_mlp": 0.0111011, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00062847, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.9024637131706448, + "language_loss": 0.70890266, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73167127, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.474879741668701 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01112006, + "balance_loss_clip": 1.00184119, + "balance_loss_mlp": 1.00061655, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 2.236398438690994, + "language_loss": 0.72090316, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74337649, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 4.053060054779053 + }, + { + "auxiliary_loss_clip": 0.01114143, + "auxiliary_loss_mlp": 0.01090661, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.00025308, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7466478172108383, + "language_loss": 0.56967586, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59172392, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.1865153312683105 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.0111067, + "balance_loss_clip": 1.00200915, + "balance_loss_mlp": 1.00071132, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.574241687397898, + "language_loss": 0.76330251, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.7860781, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 2.558600902557373 + }, + { + "auxiliary_loss_clip": 0.01131862, + "auxiliary_loss_mlp": 0.01090173, + "balance_loss_clip": 1.00147104, + "balance_loss_mlp": 1.00014615, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8489120963636263, + "language_loss": 0.60498726, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62720764, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.133348226547241 + }, + { + "auxiliary_loss_clip": 0.01167087, + "auxiliary_loss_mlp": 0.01111372, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.00055492, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 1.9867801596495636, + "language_loss": 0.71948576, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.74227035, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.485244035720825 + }, + { + "auxiliary_loss_clip": 0.01135359, + "auxiliary_loss_mlp": 0.01111147, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00071192, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.6642380362116391, + "language_loss": 0.73479736, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75726247, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 4.080806255340576 + }, + { + "auxiliary_loss_clip": 0.0115005, + "auxiliary_loss_mlp": 0.0111094, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00050473, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.6428497244992064, + "language_loss": 0.72283006, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74544001, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.6192820072174072 + }, + { + "auxiliary_loss_clip": 0.01134881, + "auxiliary_loss_mlp": 0.01111168, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00044632, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.859118863564614, + "language_loss": 0.74966753, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77212799, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.692812204360962 + }, + { + "auxiliary_loss_clip": 0.01120389, + "auxiliary_loss_mlp": 0.01111052, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00061607, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.3252568047944713, + "language_loss": 0.72163123, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74394566, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 3.076820135116577 + }, + { + "auxiliary_loss_clip": 0.01151964, + "auxiliary_loss_mlp": 0.01110599, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00044954, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.5180673733411296, + "language_loss": 0.80172497, + "learning_rate": 1.53745602625755e-06, + "loss": 0.8243506, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.5112457275390625 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01111429, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.00061214, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.531735136477734, + "language_loss": 0.79028761, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81273919, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.6037533283233643 + }, + { + "auxiliary_loss_clip": 0.011358, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00065577, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.610548840574648, + "language_loss": 0.83789736, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.86035967, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.562236785888672 + }, + { + "auxiliary_loss_clip": 0.01150278, + "auxiliary_loss_mlp": 0.01111218, + "balance_loss_clip": 1.00189865, + "balance_loss_mlp": 1.00059175, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.478964391303724, + "language_loss": 0.69725287, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71986783, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.6023762226104736 + }, + { + "auxiliary_loss_clip": 0.01152303, + "auxiliary_loss_mlp": 0.00747929, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00132871, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 2.0592328797854877, + "language_loss": 0.63489854, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65390086, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.638442039489746 + }, + { + "auxiliary_loss_clip": 0.01162953, + "auxiliary_loss_mlp": 0.00745891, + "balance_loss_clip": 1.0013721, + "balance_loss_mlp": 1.00033975, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7146345204306844, + "language_loss": 0.53883362, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55792207, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.1189706325531006 + }, + { + "auxiliary_loss_clip": 0.01118253, + "auxiliary_loss_mlp": 0.01109619, + "balance_loss_clip": 1.00165772, + "balance_loss_mlp": 1.0006144, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.3654102756037068, + "language_loss": 0.70588481, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.7281636, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.630350351333618 + }, + { + "auxiliary_loss_clip": 0.01101203, + "auxiliary_loss_mlp": 0.01108954, + "balance_loss_clip": 1.0016166, + "balance_loss_mlp": 1.00042605, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.7218095467052374, + "language_loss": 0.6766901, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69879174, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.6847052574157715 + }, + { + "auxiliary_loss_clip": 0.0110294, + "auxiliary_loss_mlp": 0.01111582, + "balance_loss_clip": 1.00163221, + "balance_loss_mlp": 1.00047851, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.5596940439361444, + "language_loss": 0.66240579, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.684551, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 2.710702419281006 + }, + { + "auxiliary_loss_clip": 0.01166925, + "auxiliary_loss_mlp": 0.01112346, + "balance_loss_clip": 1.00195205, + "balance_loss_mlp": 1.00067055, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.5658036892443323, + "language_loss": 0.7406311, + "learning_rate": 1.534046611017519e-06, + "loss": 0.7634238, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 2.5400197505950928 + }, + { + "auxiliary_loss_clip": 0.01117724, + "auxiliary_loss_mlp": 0.01111703, + "balance_loss_clip": 1.0017637, + "balance_loss_mlp": 1.00059962, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.3169001630027233, + "language_loss": 0.53754652, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55984074, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.7024168968200684 + }, + { + "auxiliary_loss_clip": 0.01151171, + "auxiliary_loss_mlp": 0.01111248, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00062203, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 3.214996931914429, + "language_loss": 0.64970267, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67232686, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 2.661588668823242 + }, + { + "auxiliary_loss_clip": 0.01150014, + "auxiliary_loss_mlp": 0.01110938, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.00059772, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 2.005820597045566, + "language_loss": 0.73593295, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75854248, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.595048427581787 + }, + { + "auxiliary_loss_clip": 0.01166984, + "auxiliary_loss_mlp": 0.01111584, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00057673, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.14011636280868, + "language_loss": 0.73968637, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76247203, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.4933786392211914 + }, + { + "auxiliary_loss_clip": 0.011181, + "auxiliary_loss_mlp": 0.01109462, + "balance_loss_clip": 1.00181401, + "balance_loss_mlp": 1.0005523, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.4365884538158966, + "language_loss": 0.7433548, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76563042, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.652050495147705 + }, + { + "auxiliary_loss_clip": 0.01119942, + "auxiliary_loss_mlp": 0.01110974, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00053871, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.731369082718028, + "language_loss": 0.69818127, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72049046, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.6520867347717285 + }, + { + "auxiliary_loss_clip": 0.01166959, + "auxiliary_loss_mlp": 0.00747849, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00129879, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8381895069710747, + "language_loss": 0.66522998, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.68437809, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 2.525676727294922 + }, + { + "auxiliary_loss_clip": 0.01134041, + "auxiliary_loss_mlp": 0.01111022, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00068176, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.8907327246098657, + "language_loss": 0.72279227, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74524295, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.5530824661254883 + }, + { + "auxiliary_loss_clip": 0.01133018, + "auxiliary_loss_mlp": 0.00747896, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00133145, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.2904158012735067, + "language_loss": 0.70520937, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72401857, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 2.5854601860046387 + }, + { + "auxiliary_loss_clip": 0.01151802, + "auxiliary_loss_mlp": 0.01111424, + "balance_loss_clip": 1.00184715, + "balance_loss_mlp": 1.000512, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 1.971630043817265, + "language_loss": 0.70174557, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72437781, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 2.4933481216430664 + }, + { + "auxiliary_loss_clip": 0.01116615, + "auxiliary_loss_mlp": 0.01111231, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00060463, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 1.8191703533808505, + "language_loss": 0.6904304, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71270883, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 3.9928550720214844 + }, + { + "auxiliary_loss_clip": 0.01102508, + "auxiliary_loss_mlp": 0.01111941, + "balance_loss_clip": 1.00151038, + "balance_loss_mlp": 1.00055194, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.810559727785109, + "language_loss": 0.6979121, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.72005653, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.761479616165161 + }, + { + "auxiliary_loss_clip": 0.01150293, + "auxiliary_loss_mlp": 0.01109662, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00037074, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.013267678684806, + "language_loss": 0.77757335, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.80017292, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.502976179122925 + }, + { + "auxiliary_loss_clip": 0.011368, + "auxiliary_loss_mlp": 0.01110134, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.00055707, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.6834574957436437, + "language_loss": 0.79375392, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81622326, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 2.595906972885132 + }, + { + "auxiliary_loss_clip": 0.01117103, + "auxiliary_loss_mlp": 0.01110341, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00038242, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.4841339793157085, + "language_loss": 0.66155052, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68382502, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.6202962398529053 + }, + { + "auxiliary_loss_clip": 0.0113489, + "auxiliary_loss_mlp": 0.01110623, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00066459, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.110029533618281, + "language_loss": 0.79760814, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82006329, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.611081600189209 + }, + { + "auxiliary_loss_clip": 0.0113541, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.00198913, + "balance_loss_mlp": 1.00120461, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.4052251013306618, + "language_loss": 0.70691538, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72574633, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 4.0679004192352295 + }, + { + "auxiliary_loss_clip": 0.01118345, + "auxiliary_loss_mlp": 0.01110346, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00057852, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 2.665007063321844, + "language_loss": 0.83387446, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85616142, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.6565723419189453 + }, + { + "auxiliary_loss_clip": 0.0115188, + "auxiliary_loss_mlp": 0.0111096, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00061941, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5492211191581833, + "language_loss": 0.76720023, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78982866, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.52810001373291 + }, + { + "auxiliary_loss_clip": 0.01105566, + "auxiliary_loss_mlp": 0.01110049, + "balance_loss_clip": 1.00177526, + "balance_loss_mlp": 1.00047219, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 1.8760793310430988, + "language_loss": 0.68876624, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71092248, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 4.0559163093566895 + }, + { + "auxiliary_loss_clip": 0.01166734, + "auxiliary_loss_mlp": 0.01109965, + "balance_loss_clip": 1.0018841, + "balance_loss_mlp": 1.0005784, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 1.610299560187505, + "language_loss": 0.60303426, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62580121, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.4492173194885254 + }, + { + "auxiliary_loss_clip": 0.0111867, + "auxiliary_loss_mlp": 0.01109755, + "balance_loss_clip": 1.00184667, + "balance_loss_mlp": 1.00075054, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.501646699597735, + "language_loss": 0.65352058, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67580485, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.6240477561950684 + }, + { + "auxiliary_loss_clip": 0.0111907, + "auxiliary_loss_mlp": 0.01110229, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00055647, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.508013924710485, + "language_loss": 0.74033844, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76263142, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 2.6110925674438477 + }, + { + "auxiliary_loss_clip": 0.01137161, + "auxiliary_loss_mlp": 0.01109874, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00039172, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.920487112346746, + "language_loss": 0.83100593, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85347629, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 3.980283498764038 + }, + { + "auxiliary_loss_clip": 0.0113507, + "auxiliary_loss_mlp": 0.01109938, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.0005517, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.639147870857385, + "language_loss": 0.7911799, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81362998, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.544563055038452 + }, + { + "auxiliary_loss_clip": 0.01166731, + "auxiliary_loss_mlp": 0.01109251, + "balance_loss_clip": 1.00193965, + "balance_loss_mlp": 1.00053203, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 1.9774932506916032, + "language_loss": 0.74110717, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76386702, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.475780487060547 + }, + { + "auxiliary_loss_clip": 0.01120024, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.00045693, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.967915805862405, + "language_loss": 0.75951087, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78182673, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.6045081615448 + }, + { + "auxiliary_loss_clip": 0.01104086, + "auxiliary_loss_mlp": 0.01110312, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00044918, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.9874341305454257, + "language_loss": 0.79022592, + "learning_rate": 1.523448741022722e-06, + "loss": 0.81236988, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.629201889038086 + }, + { + "auxiliary_loss_clip": 0.01118872, + "auxiliary_loss_mlp": 0.01111415, + "balance_loss_clip": 1.00177622, + "balance_loss_mlp": 1.00040698, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 1.6816863988529356, + "language_loss": 0.66106069, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68336356, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.632910966873169 + }, + { + "auxiliary_loss_clip": 0.01150016, + "auxiliary_loss_mlp": 0.01109883, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.00040126, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.498275019275633, + "language_loss": 0.77812952, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80072856, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 2.534106969833374 + }, + { + "auxiliary_loss_clip": 0.01150178, + "auxiliary_loss_mlp": 0.01110911, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00057054, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.7262931527255971, + "language_loss": 0.72909111, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75170201, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.562845468521118 + }, + { + "auxiliary_loss_clip": 0.01135552, + "auxiliary_loss_mlp": 0.01109657, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.0004611, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.9610311391206725, + "language_loss": 0.74959528, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.7720474, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.5665335655212402 + }, + { + "auxiliary_loss_clip": 0.01151714, + "auxiliary_loss_mlp": 0.0074793, + "balance_loss_clip": 1.00190485, + "balance_loss_mlp": 1.00134325, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 2.1731709987637093, + "language_loss": 0.78315997, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.80215639, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.542253017425537 + }, + { + "auxiliary_loss_clip": 0.01166741, + "auxiliary_loss_mlp": 0.01110841, + "balance_loss_clip": 1.00185668, + "balance_loss_mlp": 1.00040579, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 2.29290344674409, + "language_loss": 0.76890022, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.7916761, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.4962480068206787 + }, + { + "auxiliary_loss_clip": 0.01151233, + "auxiliary_loss_mlp": 0.01111163, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00044107, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 1.6583948851038388, + "language_loss": 0.74061739, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76324129, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.5122780799865723 + }, + { + "auxiliary_loss_clip": 0.01103446, + "auxiliary_loss_mlp": 0.0111099, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00045872, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 2.4010122609899907, + "language_loss": 0.72065544, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.74279976, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.658174991607666 + }, + { + "auxiliary_loss_clip": 0.01133516, + "auxiliary_loss_mlp": 0.01110847, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00050735, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 2.016734887002303, + "language_loss": 0.82141459, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84385824, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.576516628265381 + }, + { + "auxiliary_loss_clip": 0.0115196, + "auxiliary_loss_mlp": 0.0111001, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00043297, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.5477232231015123, + "language_loss": 0.80824435, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83086407, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 2.524251699447632 + }, + { + "auxiliary_loss_clip": 0.01150557, + "auxiliary_loss_mlp": 0.01110257, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00039339, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.8393092022455702, + "language_loss": 0.77030349, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79291165, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.5276639461517334 + }, + { + "auxiliary_loss_clip": 0.01117945, + "auxiliary_loss_mlp": 0.01110047, + "balance_loss_clip": 1.00174272, + "balance_loss_mlp": 1.00047028, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 3.718148779413338, + "language_loss": 0.70506513, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72734505, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.5856544971466064 + }, + { + "auxiliary_loss_clip": 0.01133616, + "auxiliary_loss_mlp": 0.01109971, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00048971, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.695070410235501, + "language_loss": 0.72031301, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74274886, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.5875275135040283 + }, + { + "auxiliary_loss_clip": 0.01132457, + "auxiliary_loss_mlp": 0.01110706, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00046158, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.7469568529503678, + "language_loss": 0.78722173, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80965334, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.565742254257202 + }, + { + "auxiliary_loss_clip": 0.01120219, + "auxiliary_loss_mlp": 0.00747931, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00137925, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 1.9321880280407375, + "language_loss": 0.75802672, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77670825, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.6275582313537598 + }, + { + "auxiliary_loss_clip": 0.01166891, + "auxiliary_loss_mlp": 0.01109933, + "balance_loss_clip": 1.00201011, + "balance_loss_mlp": 1.00054657, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.0454054827178894, + "language_loss": 0.81232065, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83508885, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.45713472366333 + }, + { + "auxiliary_loss_clip": 0.01102965, + "auxiliary_loss_mlp": 0.01110374, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00060606, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.5312473786437975, + "language_loss": 0.76576018, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78789353, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 2.640650749206543 + }, + { + "auxiliary_loss_clip": 0.01116486, + "auxiliary_loss_mlp": 0.01110204, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00043583, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 2.48763351640363, + "language_loss": 0.66931391, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.69158089, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.6005871295928955 + }, + { + "auxiliary_loss_clip": 0.01166876, + "auxiliary_loss_mlp": 0.01110585, + "balance_loss_clip": 1.0020026, + "balance_loss_mlp": 1.00053108, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 2.7168684440831767, + "language_loss": 0.77858865, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80136323, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.5193004608154297 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01089187, + "balance_loss_clip": 1.00127101, + "balance_loss_mlp": 1.00030446, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9209336549060454, + "language_loss": 0.65102553, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67305541, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 3.1654481887817383 + }, + { + "auxiliary_loss_clip": 0.01118728, + "auxiliary_loss_mlp": 0.01109954, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00056791, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.7573653188501588, + "language_loss": 0.61061108, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63289785, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 4.234946250915527 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01111057, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00052631, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 2.285127909745458, + "language_loss": 0.82439107, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84716928, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.4886250495910645 + }, + { + "auxiliary_loss_clip": 0.01135098, + "auxiliary_loss_mlp": 0.0111024, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.00047231, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 1.79153835296727, + "language_loss": 0.72963417, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75208753, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.5746099948883057 + }, + { + "auxiliary_loss_clip": 0.01120786, + "auxiliary_loss_mlp": 0.00747882, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00121343, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.095832359532062, + "language_loss": 0.8293283, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84801495, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.6264007091522217 + }, + { + "auxiliary_loss_clip": 0.0114983, + "auxiliary_loss_mlp": 0.01108046, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00047183, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7103527054533798, + "language_loss": 0.76536012, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78793889, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.5920844078063965 + }, + { + "auxiliary_loss_clip": 0.01133235, + "auxiliary_loss_mlp": 0.01109513, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00050831, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.7622518895878845, + "language_loss": 0.71910542, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74153292, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.6056370735168457 + }, + { + "auxiliary_loss_clip": 0.01099732, + "auxiliary_loss_mlp": 0.01109448, + "balance_loss_clip": 1.00155306, + "balance_loss_mlp": 1.00044346, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.6860028075903266, + "language_loss": 0.79388064, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81597245, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 4.106472969055176 + }, + { + "auxiliary_loss_clip": 0.01099365, + "auxiliary_loss_mlp": 0.01110715, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.0004704, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 1.8243837872425914, + "language_loss": 0.88425887, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90635967, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.6064372062683105 + }, + { + "auxiliary_loss_clip": 0.01130345, + "auxiliary_loss_mlp": 0.01089289, + "balance_loss_clip": 1.00140977, + "balance_loss_mlp": 1.00040615, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.9169644633238353, + "language_loss": 0.57891721, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.6011135, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 4.4702723026275635 + }, + { + "auxiliary_loss_clip": 0.01150372, + "auxiliary_loss_mlp": 0.00747727, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.00104368, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.1658039209148243, + "language_loss": 0.75981891, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77879989, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.5415079593658447 + }, + { + "auxiliary_loss_clip": 0.01133568, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.0004518, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.9119514087680094, + "language_loss": 0.77544534, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79786789, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.57020902633667 + }, + { + "auxiliary_loss_clip": 0.01149996, + "auxiliary_loss_mlp": 0.01109451, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00044656, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.8121255031787222, + "language_loss": 0.83437598, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85697043, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.4958465099334717 + }, + { + "auxiliary_loss_clip": 0.01151516, + "auxiliary_loss_mlp": 0.01110367, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.0005033, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.6179936005297033, + "language_loss": 0.74188554, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76450431, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.524937391281128 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01109848, + "balance_loss_clip": 1.00201225, + "balance_loss_mlp": 1.00055718, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.549480390779456, + "language_loss": 0.78397018, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80673754, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 3.914116859436035 + }, + { + "auxiliary_loss_clip": 0.01137283, + "auxiliary_loss_mlp": 0.01109981, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00040424, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 1.8045637437918083, + "language_loss": 0.7361201, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.75859272, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 2.546334743499756 + }, + { + "auxiliary_loss_clip": 0.01120209, + "auxiliary_loss_mlp": 0.01110425, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00037098, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.077048430820037, + "language_loss": 0.82391429, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84622061, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.6008076667785645 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.01110811, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00037503, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.788304720355969, + "language_loss": 0.79707402, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81939149, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.62485408782959 + }, + { + "auxiliary_loss_clip": 0.01099287, + "auxiliary_loss_mlp": 0.01109874, + "balance_loss_clip": 1.00171816, + "balance_loss_mlp": 1.00058281, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.8613805009429683, + "language_loss": 0.69577491, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71786654, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.653691291809082 + }, + { + "auxiliary_loss_clip": 0.01135561, + "auxiliary_loss_mlp": 0.01110907, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.0005672, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.0135916282572612, + "language_loss": 0.65656817, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67903286, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.5756680965423584 + }, + { + "auxiliary_loss_clip": 0.01135275, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00051832, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.5888861877059035, + "language_loss": 0.81449699, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83695734, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 2.6291682720184326 + }, + { + "auxiliary_loss_clip": 0.01135037, + "auxiliary_loss_mlp": 0.01110191, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00051856, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5688775011717815, + "language_loss": 0.69053841, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71299064, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.5656614303588867 + }, + { + "auxiliary_loss_clip": 0.01135109, + "auxiliary_loss_mlp": 0.01110276, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00050783, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 2.2900637194750897, + "language_loss": 0.82667696, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84913075, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.596278190612793 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.011105, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00044596, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.846694585676008, + "language_loss": 0.81903291, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84151137, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.5836594104766846 + }, + { + "auxiliary_loss_clip": 0.01100624, + "auxiliary_loss_mlp": 0.01110696, + "balance_loss_clip": 1.00194716, + "balance_loss_mlp": 1.00045156, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.9518551030926572, + "language_loss": 0.74349213, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76560533, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.01118449, + "auxiliary_loss_mlp": 0.01110274, + "balance_loss_clip": 1.00156057, + "balance_loss_mlp": 1.00050557, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.7612970184632202, + "language_loss": 0.63566756, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65795469, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 2.760075330734253 + }, + { + "auxiliary_loss_clip": 0.01117841, + "auxiliary_loss_mlp": 0.00747503, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00111187, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.731742697036419, + "language_loss": 0.7685585, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78721195, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.637439727783203 + }, + { + "auxiliary_loss_clip": 0.01135249, + "auxiliary_loss_mlp": 0.01109638, + "balance_loss_clip": 1.00190127, + "balance_loss_mlp": 1.00044262, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.581258443507296, + "language_loss": 0.62394965, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64639854, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.6008987426757812 + }, + { + "auxiliary_loss_clip": 0.01150208, + "auxiliary_loss_mlp": 0.01110441, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00048232, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.2185595462976178, + "language_loss": 0.76034343, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.78294986, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.536869764328003 + }, + { + "auxiliary_loss_clip": 0.01135214, + "auxiliary_loss_mlp": 0.01109477, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00047195, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.8729812584851884, + "language_loss": 0.75780046, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.78024733, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 2.604104995727539 + }, + { + "auxiliary_loss_clip": 0.01120077, + "auxiliary_loss_mlp": 0.01109999, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.0006125, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.9062545499622994, + "language_loss": 0.75677514, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77907586, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 2.6125245094299316 + }, + { + "auxiliary_loss_clip": 0.01133252, + "auxiliary_loss_mlp": 0.01110252, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00048399, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.9497852209597726, + "language_loss": 0.70860028, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.73103535, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.6257355213165283 + }, + { + "auxiliary_loss_clip": 0.01133259, + "auxiliary_loss_mlp": 0.00747863, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00124407, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 1.5577978521072033, + "language_loss": 0.80420589, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82301706, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.5686442852020264 + }, + { + "auxiliary_loss_clip": 0.01133377, + "auxiliary_loss_mlp": 0.01109112, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00039363, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 2.412880616034699, + "language_loss": 0.67367029, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69609523, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.6447982788085938 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01109915, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00052857, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.9209292732358885, + "language_loss": 0.88929844, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.9115932, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.591066360473633 + }, + { + "auxiliary_loss_clip": 0.01149092, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00045538, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.6606700316231031, + "language_loss": 0.87235677, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.89493465, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 2.5280139446258545 + }, + { + "auxiliary_loss_clip": 0.01150103, + "auxiliary_loss_mlp": 0.01110199, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00043106, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 1.934416983074216, + "language_loss": 0.77235043, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79495341, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.4886226654052734 + }, + { + "auxiliary_loss_clip": 0.01122259, + "auxiliary_loss_mlp": 0.01110228, + "balance_loss_clip": 1.00198424, + "balance_loss_mlp": 1.00055575, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.7845237757875334, + "language_loss": 0.64125049, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66357535, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.6018314361572266 + }, + { + "auxiliary_loss_clip": 0.01149445, + "auxiliary_loss_mlp": 0.01109087, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00046396, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.8676000729632085, + "language_loss": 0.77018213, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79276747, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 3.9725780487060547 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.00747738, + "balance_loss_clip": 1.00166416, + "balance_loss_mlp": 1.00106072, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.1701023129566894, + "language_loss": 0.7501632, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.76863801, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.6634929180145264 + }, + { + "auxiliary_loss_clip": 0.01118428, + "auxiliary_loss_mlp": 0.01109969, + "balance_loss_clip": 1.00191128, + "balance_loss_mlp": 1.00058293, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 2.7789082649832197, + "language_loss": 0.76113117, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78341508, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.6488828659057617 + }, + { + "auxiliary_loss_clip": 0.01116223, + "auxiliary_loss_mlp": 0.01108487, + "balance_loss_clip": 1.0016247, + "balance_loss_mlp": 1.00034022, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.915210398771567, + "language_loss": 0.70486927, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72711635, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.6374289989471436 + }, + { + "auxiliary_loss_clip": 0.01103983, + "auxiliary_loss_mlp": 0.01108715, + "balance_loss_clip": 1.0015955, + "balance_loss_mlp": 1.00037813, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.7859998083279705, + "language_loss": 0.78072965, + "learning_rate": 1.500032899685832e-06, + "loss": 0.80285656, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.6730265617370605 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01111053, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00061774, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.7200750763821866, + "language_loss": 0.70631039, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72877139, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.6315088272094727 + }, + { + "auxiliary_loss_clip": 0.01134858, + "auxiliary_loss_mlp": 0.01110377, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00060904, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.3533713576720758, + "language_loss": 0.67276418, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69521648, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 4.151219367980957 + }, + { + "auxiliary_loss_clip": 0.0113521, + "auxiliary_loss_mlp": 0.01109556, + "balance_loss_clip": 1.00173545, + "balance_loss_mlp": 1.00055075, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 1.8571658820719226, + "language_loss": 0.78114295, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.8035906, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 2.5583248138427734 + }, + { + "auxiliary_loss_clip": 0.01133161, + "auxiliary_loss_mlp": 0.0110936, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.00035548, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 2.394715566000499, + "language_loss": 0.7195369, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74196219, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.6423518657684326 + }, + { + "auxiliary_loss_clip": 0.01133402, + "auxiliary_loss_mlp": 0.01109443, + "balance_loss_clip": 1.0019592, + "balance_loss_mlp": 1.00043762, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.8917942754346644, + "language_loss": 0.66413617, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68656462, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 4.008826971054077 + }, + { + "auxiliary_loss_clip": 0.01086328, + "auxiliary_loss_mlp": 0.00747653, + "balance_loss_clip": 1.00161934, + "balance_loss_mlp": 1.00098825, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.3699548835146516, + "language_loss": 0.74998266, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.76832247, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.720848560333252 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01110407, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.0006392, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.754338266619954, + "language_loss": 0.74263215, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76474339, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 2.97299861907959 + }, + { + "auxiliary_loss_clip": 0.01102644, + "auxiliary_loss_mlp": 0.01109326, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00041628, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.1090049844452396, + "language_loss": 0.72017634, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74229604, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.657885789871216 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01109794, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.00050259, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 2.050692112899229, + "language_loss": 0.74499762, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76727849, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 4.056415319442749 + }, + { + "auxiliary_loss_clip": 0.01151968, + "auxiliary_loss_mlp": 0.01111062, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.0005312, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.1611507147545446, + "language_loss": 0.79333282, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.81596309, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.5314602851867676 + }, + { + "auxiliary_loss_clip": 0.011501, + "auxiliary_loss_mlp": 0.0111031, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00054216, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 2.520491703229493, + "language_loss": 0.84776652, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87037063, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.5743675231933594 + }, + { + "auxiliary_loss_clip": 0.01131957, + "auxiliary_loss_mlp": 0.01088526, + "balance_loss_clip": 1.00146687, + "balance_loss_mlp": 1.00002539, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7125522849316661, + "language_loss": 0.60021937, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62242419, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.2232677936553955 + }, + { + "auxiliary_loss_clip": 0.01136647, + "auxiliary_loss_mlp": 0.01110306, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00044262, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 1.8860984667874297, + "language_loss": 0.77955246, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.80202198, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 2.560389518737793 + }, + { + "auxiliary_loss_clip": 0.01151966, + "auxiliary_loss_mlp": 0.01108952, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00042403, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.6723327603665383, + "language_loss": 0.75759667, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78020585, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.5459187030792236 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01110594, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.00053978, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.1074646786324482, + "language_loss": 0.81189436, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83449906, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.498208999633789 + }, + { + "auxiliary_loss_clip": 0.01135473, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00119901, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.6675929951511699, + "language_loss": 0.71427065, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.7331022, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.7857906818389893 + }, + { + "auxiliary_loss_clip": 0.01149781, + "auxiliary_loss_mlp": 0.01109189, + "balance_loss_clip": 1.00183022, + "balance_loss_mlp": 1.00047016, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.4074998508109298, + "language_loss": 0.57531011, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59789985, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.5857369899749756 + }, + { + "auxiliary_loss_clip": 0.01151461, + "auxiliary_loss_mlp": 0.01109307, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00058818, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.7372972703454295, + "language_loss": 0.77343476, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79604244, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.6228275299072266 + }, + { + "auxiliary_loss_clip": 0.01150042, + "auxiliary_loss_mlp": 0.0110975, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.00045931, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.0061308648653644, + "language_loss": 0.82248032, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84507823, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.6062872409820557 + }, + { + "auxiliary_loss_clip": 0.01150415, + "auxiliary_loss_mlp": 0.01109553, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00064349, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.0024782408834767, + "language_loss": 0.79080975, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81340945, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.5116360187530518 + }, + { + "auxiliary_loss_clip": 0.01116996, + "auxiliary_loss_mlp": 0.00747781, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00107574, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.7468726239089967, + "language_loss": 0.74156523, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76021302, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.620628595352173 + }, + { + "auxiliary_loss_clip": 0.01166898, + "auxiliary_loss_mlp": 0.01110419, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00046074, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.113471503006175, + "language_loss": 0.66515905, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68793219, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.55428409576416 + }, + { + "auxiliary_loss_clip": 0.01133256, + "auxiliary_loss_mlp": 0.01110138, + "balance_loss_clip": 1.00194156, + "balance_loss_mlp": 1.000561, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.2564943084130484, + "language_loss": 0.77145523, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79388916, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.6374459266662598 + }, + { + "auxiliary_loss_clip": 0.01145812, + "auxiliary_loss_mlp": 0.01088583, + "balance_loss_clip": 1.00146222, + "balance_loss_mlp": 1.0000819, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8298484093890961, + "language_loss": 0.64526814, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66761208, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 3.004453420639038 + }, + { + "auxiliary_loss_clip": 0.01150252, + "auxiliary_loss_mlp": 0.01109798, + "balance_loss_clip": 1.00173306, + "balance_loss_mlp": 1.00050664, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.6526246871660542, + "language_loss": 0.69170713, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71430761, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.5527830123901367 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01110009, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00052714, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5026780838774054, + "language_loss": 0.79614723, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81861722, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 2.6132009029388428 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01108766, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00052392, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.5586605496599855, + "language_loss": 0.70775807, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.73002869, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.630772590637207 + }, + { + "auxiliary_loss_clip": 0.01116758, + "auxiliary_loss_mlp": 0.01109895, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00041366, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 1.9373864928997075, + "language_loss": 0.69343442, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71570092, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.5882327556610107 + }, + { + "auxiliary_loss_clip": 0.01149954, + "auxiliary_loss_mlp": 0.01108788, + "balance_loss_clip": 1.00194156, + "balance_loss_mlp": 1.00054574, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.7505729668482966, + "language_loss": 0.53142583, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55401319, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.5531675815582275 + }, + { + "auxiliary_loss_clip": 0.0111519, + "auxiliary_loss_mlp": 0.0108913, + "balance_loss_clip": 1.00142729, + "balance_loss_mlp": 1.00024748, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6585841433094313, + "language_loss": 0.54556674, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56760991, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.2490110397338867 + }, + { + "auxiliary_loss_clip": 0.01117037, + "auxiliary_loss_mlp": 0.01108822, + "balance_loss_clip": 1.00165021, + "balance_loss_mlp": 1.0004847, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.8976758538582583, + "language_loss": 0.75019246, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.77245104, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 2.6292169094085693 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01110267, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00049913, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.6745026396271172, + "language_loss": 0.77811813, + "learning_rate": 1.487975602873434e-06, + "loss": 0.80039918, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.597862958908081 + }, + { + "auxiliary_loss_clip": 0.01104336, + "auxiliary_loss_mlp": 0.01109872, + "balance_loss_clip": 1.0018121, + "balance_loss_mlp": 1.00048542, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.652236322618382, + "language_loss": 0.79054403, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81268609, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 4.043051481246948 + }, + { + "auxiliary_loss_clip": 0.01149881, + "auxiliary_loss_mlp": 0.01109366, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00045669, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 2.00390036911468, + "language_loss": 0.83314097, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85573339, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.5985546112060547 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.01109591, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.00049102, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.8218313758591096, + "language_loss": 0.70932329, + "learning_rate": 1.486846243389939e-06, + "loss": 0.73175156, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.5753283500671387 + }, + { + "auxiliary_loss_clip": 0.011518, + "auxiliary_loss_mlp": 0.01111865, + "balance_loss_clip": 1.00186932, + "balance_loss_mlp": 1.00047565, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.2139570482893, + "language_loss": 0.64344382, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66608047, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.6206068992614746 + }, + { + "auxiliary_loss_clip": 0.01166735, + "auxiliary_loss_mlp": 0.01109859, + "balance_loss_clip": 1.0020417, + "balance_loss_mlp": 1.00047302, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.6136146959242512, + "language_loss": 0.72179008, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74455607, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.5162534713745117 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.01109569, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.00056362, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 2.1118795760057836, + "language_loss": 0.84443855, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86720043, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.5086607933044434 + }, + { + "auxiliary_loss_clip": 0.01095383, + "auxiliary_loss_mlp": 0.01088731, + "balance_loss_clip": 1.00150132, + "balance_loss_mlp": 1.00022972, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8011329864433346, + "language_loss": 0.58204567, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60388684, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.073939323425293 + }, + { + "auxiliary_loss_clip": 0.01086012, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_clip": 1.00180817, + "balance_loss_mlp": 1.00032568, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 1.575069697293792, + "language_loss": 0.76783442, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.78978688, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 4.119044065475464 + }, + { + "auxiliary_loss_clip": 0.01116578, + "auxiliary_loss_mlp": 0.01109241, + "balance_loss_clip": 1.00158966, + "balance_loss_mlp": 1.0005219, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.8008146934311522, + "language_loss": 0.78017634, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.80243456, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 2.8146777153015137 + }, + { + "auxiliary_loss_clip": 0.0115035, + "auxiliary_loss_mlp": 0.01110638, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00058365, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.4738898924683181, + "language_loss": 0.72603011, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74864, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.64314603805542 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01108876, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00053835, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.7235124567376199, + "language_loss": 0.69602764, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71863115, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 3.9532527923583984 + }, + { + "auxiliary_loss_clip": 0.01150016, + "auxiliary_loss_mlp": 0.01109529, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00052428, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.7736134482059136, + "language_loss": 0.74991298, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77250838, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.5951335430145264 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01110499, + "balance_loss_clip": 1.00210178, + "balance_loss_mlp": 1.00054026, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.5708547614893094, + "language_loss": 0.66986811, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69232619, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.7808103561401367 + }, + { + "auxiliary_loss_clip": 0.0107136, + "auxiliary_loss_mlp": 0.01109272, + "balance_loss_clip": 1.00168407, + "balance_loss_mlp": 1.00045776, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 1.896190230514715, + "language_loss": 0.76375818, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78556454, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.7438957691192627 + }, + { + "auxiliary_loss_clip": 0.01162953, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_clip": 1.00147867, + "balance_loss_mlp": 1.00003934, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9219466182574249, + "language_loss": 0.733639, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75615394, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 4.481431722640991 + }, + { + "auxiliary_loss_clip": 0.01133462, + "auxiliary_loss_mlp": 0.01109843, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00045693, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.6876971233091482, + "language_loss": 0.69514686, + "learning_rate": 1.481954380961799e-06, + "loss": 0.7175799, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.6498048305511475 + }, + { + "auxiliary_loss_clip": 0.01151769, + "auxiliary_loss_mlp": 0.01111836, + "balance_loss_clip": 1.00210214, + "balance_loss_mlp": 1.00063801, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.7970740441013406, + "language_loss": 0.65539777, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.67803389, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.5509958267211914 + }, + { + "auxiliary_loss_clip": 0.01119903, + "auxiliary_loss_mlp": 0.01110593, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00053942, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.5444555841994942, + "language_loss": 0.72857678, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75088173, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.7076196670532227 + }, + { + "auxiliary_loss_clip": 0.0111505, + "auxiliary_loss_mlp": 0.00747585, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00099277, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 1.78553335901347, + "language_loss": 0.79763687, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.8162632, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.697138786315918 + }, + { + "auxiliary_loss_clip": 0.01117872, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00056231, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.7745717463239141, + "language_loss": 0.67415643, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69642985, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.578005313873291 + }, + { + "auxiliary_loss_clip": 0.01133602, + "auxiliary_loss_mlp": 0.01109585, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00058031, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.5364411614782914, + "language_loss": 0.78598559, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.8084175, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.5693166255950928 + }, + { + "auxiliary_loss_clip": 0.01135338, + "auxiliary_loss_mlp": 0.01109744, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00045288, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.714320380986478, + "language_loss": 0.82508159, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84753239, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.5691230297088623 + }, + { + "auxiliary_loss_clip": 0.011354, + "auxiliary_loss_mlp": 0.0110921, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.0004915, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 2.2710505610099894, + "language_loss": 0.77536535, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79781139, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 2.566465377807617 + }, + { + "auxiliary_loss_clip": 0.01150071, + "auxiliary_loss_mlp": 0.01109664, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00065923, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.4139437629898826, + "language_loss": 0.78888249, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81147981, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.6111738681793213 + }, + { + "auxiliary_loss_clip": 0.01133592, + "auxiliary_loss_mlp": 0.01109087, + "balance_loss_clip": 1.00177896, + "balance_loss_mlp": 1.00046372, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.8600710761411503, + "language_loss": 0.77350605, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79593283, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.593614101409912 + }, + { + "auxiliary_loss_clip": 0.0115192, + "auxiliary_loss_mlp": 0.01109985, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.0005033, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.3512649624484037, + "language_loss": 0.82881737, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.85143638, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.522113561630249 + }, + { + "auxiliary_loss_clip": 0.01152045, + "auxiliary_loss_mlp": 0.01109554, + "balance_loss_clip": 1.00206625, + "balance_loss_mlp": 1.00054955, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.9218970665746415, + "language_loss": 0.80907345, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.83168948, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.511597156524658 + }, + { + "auxiliary_loss_clip": 0.01151402, + "auxiliary_loss_mlp": 0.00747542, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00092125, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 2.0837126900159335, + "language_loss": 0.76976013, + "learning_rate": 1.477441761580111e-06, + "loss": 0.78874958, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.5618982315063477 + }, + { + "auxiliary_loss_clip": 0.01136892, + "auxiliary_loss_mlp": 0.01110405, + "balance_loss_clip": 1.00196278, + "balance_loss_mlp": 1.0006367, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.8915544474130805, + "language_loss": 0.76170564, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78417861, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.567448377609253 + }, + { + "auxiliary_loss_clip": 0.0115157, + "auxiliary_loss_mlp": 0.01109557, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00055218, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 2.2970433990235373, + "language_loss": 0.66978443, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.69239569, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.5188403129577637 + }, + { + "auxiliary_loss_clip": 0.01135005, + "auxiliary_loss_mlp": 0.01109342, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00062323, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 1.8834786603824643, + "language_loss": 0.71486425, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.73730767, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 2.6218960285186768 + }, + { + "auxiliary_loss_clip": 0.01105575, + "auxiliary_loss_mlp": 0.00747632, + "balance_loss_clip": 1.00183308, + "balance_loss_mlp": 1.00095201, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.9448718716654, + "language_loss": 0.69974691, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.718279, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.9207990169525146 + }, + { + "auxiliary_loss_clip": 0.0110409, + "auxiliary_loss_mlp": 0.01110865, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00061989, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.762060930598396, + "language_loss": 0.64120448, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.66335404, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.862445116043091 + }, + { + "auxiliary_loss_clip": 0.01166724, + "auxiliary_loss_mlp": 0.01108758, + "balance_loss_clip": 1.00201356, + "balance_loss_mlp": 1.00051618, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.5230628876769685, + "language_loss": 0.6970185, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71977329, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 2.5862877368927 + }, + { + "auxiliary_loss_clip": 0.01100984, + "auxiliary_loss_mlp": 0.01108687, + "balance_loss_clip": 1.00180161, + "balance_loss_mlp": 1.00054002, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.6931462451886414, + "language_loss": 0.76979876, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.79189539, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.667670726776123 + }, + { + "auxiliary_loss_clip": 0.01133681, + "auxiliary_loss_mlp": 0.01110573, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00051868, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.6405319531549587, + "language_loss": 0.69005162, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71249413, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.574465274810791 + }, + { + "auxiliary_loss_clip": 0.01146912, + "auxiliary_loss_mlp": 0.01088992, + "balance_loss_clip": 1.0016861, + "balance_loss_mlp": 1.00049078, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8583511622685548, + "language_loss": 0.64297521, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66533422, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 3.034221649169922 + }, + { + "auxiliary_loss_clip": 0.01119692, + "auxiliary_loss_mlp": 0.01110223, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.00055003, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.7679478585415636, + "language_loss": 0.74103129, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76333046, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 2.6170573234558105 + }, + { + "auxiliary_loss_clip": 0.011466, + "auxiliary_loss_mlp": 0.01089432, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.00054979, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6687357862574402, + "language_loss": 0.52021408, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54257441, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 4.6060426235198975 + }, + { + "auxiliary_loss_clip": 0.01162871, + "auxiliary_loss_mlp": 0.01088388, + "balance_loss_clip": 1.00144935, + "balance_loss_mlp": 1.00026894, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8541266010121468, + "language_loss": 0.54225552, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56476808, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.0080575942993164 + }, + { + "auxiliary_loss_clip": 0.01135249, + "auxiliary_loss_mlp": 0.01108901, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 1.00056386, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.589536151458575, + "language_loss": 0.65709376, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67953527, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.6630642414093018 + }, + { + "auxiliary_loss_clip": 0.01102458, + "auxiliary_loss_mlp": 0.01111099, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.00075877, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.175137464992649, + "language_loss": 0.67823684, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.7003724, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.673682689666748 + }, + { + "auxiliary_loss_clip": 0.01150162, + "auxiliary_loss_mlp": 0.01110237, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.0005641, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 2.1046467791281294, + "language_loss": 0.77594519, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79854918, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 2.624526023864746 + }, + { + "auxiliary_loss_clip": 0.01152179, + "auxiliary_loss_mlp": 0.01109468, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00046337, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.5194378782955809, + "language_loss": 0.75864279, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78125918, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 2.63611102104187 + }, + { + "auxiliary_loss_clip": 0.01100994, + "auxiliary_loss_mlp": 0.01110018, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.0004406, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 1.9284802138439041, + "language_loss": 0.68408775, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70619786, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.6769778728485107 + }, + { + "auxiliary_loss_clip": 0.01133889, + "auxiliary_loss_mlp": 0.01108704, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00065303, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.251560902979069, + "language_loss": 0.70130777, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72373372, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 4.056702613830566 + }, + { + "auxiliary_loss_clip": 0.01134609, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_clip": 1.00172925, + "balance_loss_mlp": 1.00052977, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 1.7517788069544715, + "language_loss": 0.77171421, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79415184, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.5597946643829346 + }, + { + "auxiliary_loss_clip": 0.01099792, + "auxiliary_loss_mlp": 0.01109793, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00059736, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 2.1118270372514645, + "language_loss": 0.75826478, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78036064, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 2.708989381790161 + }, + { + "auxiliary_loss_clip": 0.0108338, + "auxiliary_loss_mlp": 0.0110843, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00056911, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.8141217562525342, + "language_loss": 0.61909139, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64100951, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 4.3531494140625 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.01109608, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.0006032, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6678693226181776, + "language_loss": 0.72493625, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74736798, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.751789093017578 + }, + { + "auxiliary_loss_clip": 0.01099613, + "auxiliary_loss_mlp": 0.01109507, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.00050211, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 1.9127933082692261, + "language_loss": 0.66979736, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69188857, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.7417964935302734 + }, + { + "auxiliary_loss_clip": 0.01151901, + "auxiliary_loss_mlp": 0.0111043, + "balance_loss_clip": 1.00199246, + "balance_loss_mlp": 1.00056672, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 1.7628412151196735, + "language_loss": 0.88527191, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90789521, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.561339855194092 + }, + { + "auxiliary_loss_clip": 0.01166621, + "auxiliary_loss_mlp": 0.01109338, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.00061953, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.6475148834728714, + "language_loss": 0.72101855, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74377811, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 4.368335723876953 + }, + { + "auxiliary_loss_clip": 0.01132786, + "auxiliary_loss_mlp": 0.01109804, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00051284, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 2.356953070948426, + "language_loss": 0.89907128, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.92149723, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.6495697498321533 + }, + { + "auxiliary_loss_clip": 0.01150208, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00066113, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 2.5708868481766483, + "language_loss": 0.70725936, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72985429, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.6387953758239746 + }, + { + "auxiliary_loss_clip": 0.0115014, + "auxiliary_loss_mlp": 0.01110084, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.00050652, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.735960999817832, + "language_loss": 0.78066087, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80326313, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.5447752475738525 + }, + { + "auxiliary_loss_clip": 0.01132954, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00088072, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.403662291336668, + "language_loss": 0.73786074, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.7602939, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.603806972503662 + }, + { + "auxiliary_loss_clip": 0.01133108, + "auxiliary_loss_mlp": 0.00747697, + "balance_loss_clip": 1.00167596, + "balance_loss_mlp": 1.00103104, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 3.491227904898135, + "language_loss": 0.79000461, + "learning_rate": 1.466172750724613e-06, + "loss": 0.80881268, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.673769474029541 + }, + { + "auxiliary_loss_clip": 0.01118791, + "auxiliary_loss_mlp": 0.01109296, + "balance_loss_clip": 1.00174308, + "balance_loss_mlp": 1.00048137, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.4326409551374184, + "language_loss": 0.69527268, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.7175535, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.69844388961792 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00052226, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.954708934856206, + "language_loss": 0.73414397, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.7565791, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 2.6248135566711426 + }, + { + "auxiliary_loss_clip": 0.01166792, + "auxiliary_loss_mlp": 0.01109003, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00057006, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.5439558118026344, + "language_loss": 0.68725413, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.71001208, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.582275867462158 + }, + { + "auxiliary_loss_clip": 0.01166932, + "auxiliary_loss_mlp": 0.01110252, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.000579, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.9472484716824976, + "language_loss": 0.73697698, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75974882, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 2.4873080253601074 + }, + { + "auxiliary_loss_clip": 0.01118091, + "auxiliary_loss_mlp": 0.01107881, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00059295, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.755537789878128, + "language_loss": 0.84703743, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86929715, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.613720178604126 + }, + { + "auxiliary_loss_clip": 0.0111962, + "auxiliary_loss_mlp": 0.00747838, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.00112414, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 1.8314246229851872, + "language_loss": 0.66243517, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68110979, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.673649787902832 + }, + { + "auxiliary_loss_clip": 0.0115193, + "auxiliary_loss_mlp": 0.01109479, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00056958, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.621514890246922, + "language_loss": 0.8355366, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.8581506, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.546506643295288 + }, + { + "auxiliary_loss_clip": 0.01136651, + "auxiliary_loss_mlp": 0.01109985, + "balance_loss_clip": 1.00190151, + "balance_loss_mlp": 1.00050354, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.458371209500753, + "language_loss": 0.79560268, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.8180691, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.6338179111480713 + }, + { + "auxiliary_loss_clip": 0.01166778, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00059676, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.6655784558141735, + "language_loss": 0.67374867, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69651246, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.577666759490967 + }, + { + "auxiliary_loss_clip": 0.0115188, + "auxiliary_loss_mlp": 0.01110311, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00063896, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.3393290405811147, + "language_loss": 0.7406584, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76328027, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.6009225845336914 + }, + { + "auxiliary_loss_clip": 0.01150102, + "auxiliary_loss_mlp": 0.01108496, + "balance_loss_clip": 1.0020051, + "balance_loss_mlp": 1.0005399, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 1.6037078361620738, + "language_loss": 0.67798996, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70057589, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.68650221824646 + }, + { + "auxiliary_loss_clip": 0.01117654, + "auxiliary_loss_mlp": 0.0110956, + "balance_loss_clip": 1.00192344, + "balance_loss_mlp": 1.0005554, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.8431678307305623, + "language_loss": 0.76669669, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.7889688, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.629535675048828 + }, + { + "auxiliary_loss_clip": 0.01150547, + "auxiliary_loss_mlp": 0.01109484, + "balance_loss_clip": 1.00207186, + "balance_loss_mlp": 1.00047898, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.7608966812530542, + "language_loss": 0.7725696, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79516989, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.570418119430542 + }, + { + "auxiliary_loss_clip": 0.01116499, + "auxiliary_loss_mlp": 0.01109147, + "balance_loss_clip": 1.00190389, + "balance_loss_mlp": 1.00052309, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.4038079675212334, + "language_loss": 0.73178005, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75403643, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.6358728408813477 + }, + { + "auxiliary_loss_clip": 0.01150251, + "auxiliary_loss_mlp": 0.01110924, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00058424, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.3629032281687925, + "language_loss": 0.68731487, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70992661, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 2.511333703994751 + }, + { + "auxiliary_loss_clip": 0.01152214, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_clip": 1.00196397, + "balance_loss_mlp": 1.00059509, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.475506154116167, + "language_loss": 0.79106045, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81368423, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.533447265625 + }, + { + "auxiliary_loss_clip": 0.01149162, + "auxiliary_loss_mlp": 0.01108981, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00045335, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.9835204309666918, + "language_loss": 0.81277007, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83535147, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 2.5041568279266357 + }, + { + "auxiliary_loss_clip": 0.01104313, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_clip": 1.00196064, + "balance_loss_mlp": 1.00058866, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.9607660432580603, + "language_loss": 0.62170494, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64385927, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.6400084495544434 + }, + { + "auxiliary_loss_clip": 0.01166731, + "auxiliary_loss_mlp": 0.01109465, + "balance_loss_clip": 1.0020628, + "balance_loss_mlp": 1.00045991, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.559468542715019, + "language_loss": 0.78790843, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81067038, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 4.05425763130188 + }, + { + "auxiliary_loss_clip": 0.0109992, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.0017122, + "balance_loss_mlp": 1.00059617, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.262525957891128, + "language_loss": 0.76013774, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78224146, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.7350547313690186 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.01109389, + "balance_loss_clip": 1.00170672, + "balance_loss_mlp": 1.00057447, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 3.8831283274002533, + "language_loss": 0.65721935, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67947698, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.603020191192627 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01109623, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00052238, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.579574676846991, + "language_loss": 0.74612129, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76873541, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.582062005996704 + }, + { + "auxiliary_loss_clip": 0.01166832, + "auxiliary_loss_mlp": 0.01110214, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.00044572, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.8536951449677692, + "language_loss": 0.77021325, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79298365, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.5099902153015137 + }, + { + "auxiliary_loss_clip": 0.01136224, + "auxiliary_loss_mlp": 0.01109856, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00066078, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.189723652455143, + "language_loss": 0.75102741, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.77348816, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 2.6459217071533203 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01110603, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.0005486, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.7105398546853416, + "language_loss": 0.68742228, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70972717, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 4.044549465179443 + }, + { + "auxiliary_loss_clip": 0.01167034, + "auxiliary_loss_mlp": 0.01111405, + "balance_loss_clip": 1.00211465, + "balance_loss_mlp": 1.00058794, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.0741001770833463, + "language_loss": 0.81830752, + "learning_rate": 1.456420997543594e-06, + "loss": 0.84109193, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.485180377960205 + }, + { + "auxiliary_loss_clip": 0.01166524, + "auxiliary_loss_mlp": 0.01108736, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00058937, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.676332023694869, + "language_loss": 0.69722402, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71997666, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.4740095138549805 + }, + { + "auxiliary_loss_clip": 0.01152196, + "auxiliary_loss_mlp": 0.01110785, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00054073, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 14.004997890364304, + "language_loss": 0.68011272, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70274258, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 3.895124673843384 + }, + { + "auxiliary_loss_clip": 0.01150049, + "auxiliary_loss_mlp": 0.01109324, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00060487, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 2.201466183104659, + "language_loss": 0.78487706, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.8074708, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 2.5724713802337646 + }, + { + "auxiliary_loss_clip": 0.01103737, + "auxiliary_loss_mlp": 0.01109996, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00060987, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.4745491481257416, + "language_loss": 0.7276088, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.74974614, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.6910533905029297 + }, + { + "auxiliary_loss_clip": 0.01118638, + "auxiliary_loss_mlp": 0.01110631, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.0006721, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 1.9120686907702473, + "language_loss": 0.77978146, + "learning_rate": 1.454547250154447e-06, + "loss": 0.80207419, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.642153024673462 + }, + { + "auxiliary_loss_clip": 0.01150441, + "auxiliary_loss_mlp": 0.01110078, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00050128, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.7725081871974184, + "language_loss": 0.83276528, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85537052, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 3.986706495285034 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01110084, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00060225, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.8312559353828495, + "language_loss": 0.71406358, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73666513, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.600457191467285 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.00747836, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.00106096, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.384377280151791, + "language_loss": 0.71740294, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73655081, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.5323405265808105 + }, + { + "auxiliary_loss_clip": 0.01133582, + "auxiliary_loss_mlp": 0.01109343, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00052857, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.8178468759599624, + "language_loss": 0.84495497, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.8673842, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.560920476913452 + }, + { + "auxiliary_loss_clip": 0.01152066, + "auxiliary_loss_mlp": 0.01109788, + "balance_loss_clip": 1.00193393, + "balance_loss_mlp": 1.00059223, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6082901991292677, + "language_loss": 0.65555149, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67817008, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.52361798286438 + }, + { + "auxiliary_loss_clip": 0.01149936, + "auxiliary_loss_mlp": 0.01109612, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00070286, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.4702383621532082, + "language_loss": 0.80601203, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82860756, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.5376133918762207 + }, + { + "auxiliary_loss_clip": 0.01117264, + "auxiliary_loss_mlp": 0.01110508, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00054896, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.8664861939862691, + "language_loss": 0.82586205, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84813976, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.6384592056274414 + }, + { + "auxiliary_loss_clip": 0.01102587, + "auxiliary_loss_mlp": 0.01110115, + "balance_loss_clip": 1.00159037, + "balance_loss_mlp": 1.0004425, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.8538025319186349, + "language_loss": 0.82896519, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.85109222, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.6132028102874756 + }, + { + "auxiliary_loss_clip": 0.01133212, + "auxiliary_loss_mlp": 0.00747918, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00121212, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 1.8754906650819338, + "language_loss": 0.66080642, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.67961776, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 2.5583081245422363 + }, + { + "auxiliary_loss_clip": 0.01118649, + "auxiliary_loss_mlp": 0.01109741, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00064111, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.2632172809979956, + "language_loss": 0.81192684, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83421075, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.5813727378845215 + }, + { + "auxiliary_loss_clip": 0.0112003, + "auxiliary_loss_mlp": 0.01108136, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00056148, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 2.1445727320185406, + "language_loss": 0.72635281, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74863446, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 2.615821361541748 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01109497, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00039649, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 1.6093957584644252, + "language_loss": 0.80510902, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.82755411, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 2.592283248901367 + }, + { + "auxiliary_loss_clip": 0.01088741, + "auxiliary_loss_mlp": 0.01110002, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00080645, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 1.8416348743262538, + "language_loss": 0.78256881, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80455625, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 2.700819492340088 + }, + { + "auxiliary_loss_clip": 0.01150107, + "auxiliary_loss_mlp": 0.01109903, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00051653, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 2.3873581768913503, + "language_loss": 0.72938371, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75198388, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 2.5224697589874268 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.01109655, + "balance_loss_clip": 1.0020138, + "balance_loss_mlp": 1.00065017, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.493746041695412, + "language_loss": 0.72143745, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74388778, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.6320862770080566 + }, + { + "auxiliary_loss_clip": 0.01102077, + "auxiliary_loss_mlp": 0.01110956, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00071108, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.5405137820090893, + "language_loss": 0.78116977, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80330008, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.6748604774475098 + }, + { + "auxiliary_loss_clip": 0.01167028, + "auxiliary_loss_mlp": 0.01110524, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00056589, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.74679012724371, + "language_loss": 0.77432537, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.7971009, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.4636173248291016 + }, + { + "auxiliary_loss_clip": 0.01150165, + "auxiliary_loss_mlp": 0.01110235, + "balance_loss_clip": 1.00199556, + "balance_loss_mlp": 1.00046682, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.5854656018364828, + "language_loss": 0.58524787, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60785186, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.652631998062134 + }, + { + "auxiliary_loss_clip": 0.01134808, + "auxiliary_loss_mlp": 0.01110024, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00054264, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.503016316671528, + "language_loss": 0.7795651, + "learning_rate": 1.447431741055314e-06, + "loss": 0.80201346, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.6128368377685547 + }, + { + "auxiliary_loss_clip": 0.011667, + "auxiliary_loss_mlp": 0.01110642, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00068378, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.0417136825201956, + "language_loss": 0.77203494, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79480839, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 2.503139019012451 + }, + { + "auxiliary_loss_clip": 0.01150133, + "auxiliary_loss_mlp": 0.01109874, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00067854, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.5855272924447799, + "language_loss": 0.72354811, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74614823, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.5610859394073486 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01109742, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00054586, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.1547880003655417, + "language_loss": 0.74843782, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77120161, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 2.491551160812378 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01109882, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00059092, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.0292653950994977, + "language_loss": 0.73717403, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75960505, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 2.5515668392181396 + }, + { + "auxiliary_loss_clip": 0.01134792, + "auxiliary_loss_mlp": 0.01109666, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00047016, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.6892849438299695, + "language_loss": 0.70326424, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72570872, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 2.5629453659057617 + }, + { + "auxiliary_loss_clip": 0.01151943, + "auxiliary_loss_mlp": 0.01109788, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00049722, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.4513606292419654, + "language_loss": 0.76349622, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78611356, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 3.9440314769744873 + }, + { + "auxiliary_loss_clip": 0.01134549, + "auxiliary_loss_mlp": 0.00747686, + "balance_loss_clip": 1.00182426, + "balance_loss_mlp": 1.00105572, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 3.729121730474724, + "language_loss": 0.74884969, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76767206, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.607940196990967 + }, + { + "auxiliary_loss_clip": 0.01148006, + "auxiliary_loss_mlp": 0.01088265, + "balance_loss_clip": 1.00143933, + "balance_loss_mlp": 1.00014579, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.7979545254158049, + "language_loss": 0.55098224, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57334495, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.222045421600342 + }, + { + "auxiliary_loss_clip": 0.01151469, + "auxiliary_loss_mlp": 0.01109905, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00061369, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.369953030683887, + "language_loss": 0.62263948, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64525324, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.6719439029693604 + }, + { + "auxiliary_loss_clip": 0.01118757, + "auxiliary_loss_mlp": 0.01109944, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.0004617, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.6815570030072546, + "language_loss": 0.75171125, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.7739982, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.618431806564331 + }, + { + "auxiliary_loss_clip": 0.01166649, + "auxiliary_loss_mlp": 0.01108631, + "balance_loss_clip": 1.00206685, + "balance_loss_mlp": 1.0006752, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.5716020495635248, + "language_loss": 0.81966203, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.84241486, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 2.5847926139831543 + }, + { + "auxiliary_loss_clip": 0.01135047, + "auxiliary_loss_mlp": 0.01108051, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00038075, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.3803409391229475, + "language_loss": 0.72405767, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74648869, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.6300554275512695 + }, + { + "auxiliary_loss_clip": 0.01130946, + "auxiliary_loss_mlp": 0.01088987, + "balance_loss_clip": 1.00137758, + "balance_loss_mlp": 1.00048602, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8145028984298449, + "language_loss": 0.54834276, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.57054204, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 4.427683353424072 + }, + { + "auxiliary_loss_clip": 0.01133633, + "auxiliary_loss_mlp": 0.01108968, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00063014, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.8666702056562376, + "language_loss": 0.8272168, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.84964275, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 2.5679128170013428 + }, + { + "auxiliary_loss_clip": 0.01134542, + "auxiliary_loss_mlp": 0.01109391, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00057697, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7036338653115566, + "language_loss": 0.83650792, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85894722, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 2.6417183876037598 + }, + { + "auxiliary_loss_clip": 0.01135324, + "auxiliary_loss_mlp": 0.01110489, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00062561, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 2.9797895028661348, + "language_loss": 0.78459179, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80704993, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 4.020474672317505 + }, + { + "auxiliary_loss_clip": 0.01116269, + "auxiliary_loss_mlp": 0.00747656, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.00092101, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.3512656917089916, + "language_loss": 0.73785281, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75649202, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 2.7145512104034424 + }, + { + "auxiliary_loss_clip": 0.01135174, + "auxiliary_loss_mlp": 0.01109695, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00059462, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.4494019183320443, + "language_loss": 0.63965321, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66210186, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 2.582130193710327 + }, + { + "auxiliary_loss_clip": 0.01149519, + "auxiliary_loss_mlp": 0.0111034, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00047624, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.5194645079434708, + "language_loss": 0.80247545, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82507402, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 3.9962809085845947 + }, + { + "auxiliary_loss_clip": 0.01149773, + "auxiliary_loss_mlp": 0.01111158, + "balance_loss_clip": 1.00224864, + "balance_loss_mlp": 1.00053132, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.4565443512243526, + "language_loss": 0.67045927, + "learning_rate": 1.439949905155693e-06, + "loss": 0.69306856, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.643526315689087 + }, + { + "auxiliary_loss_clip": 0.01151782, + "auxiliary_loss_mlp": 0.01109662, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00056195, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 2.1464891010728606, + "language_loss": 0.74009109, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.7627055, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.7123045921325684 + }, + { + "auxiliary_loss_clip": 0.01149982, + "auxiliary_loss_mlp": 0.01109705, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.00060499, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.6955172960478289, + "language_loss": 0.72784841, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75044525, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.6491687297821045 + }, + { + "auxiliary_loss_clip": 0.01166877, + "auxiliary_loss_mlp": 0.01111667, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.0006597, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.106943256816157, + "language_loss": 0.67454565, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69733113, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.576805830001831 + }, + { + "auxiliary_loss_clip": 0.01166535, + "auxiliary_loss_mlp": 0.01108565, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00051355, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.7818909692639857, + "language_loss": 0.80066061, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82341158, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.507791757583618 + }, + { + "auxiliary_loss_clip": 0.01120492, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_clip": 1.00197589, + "balance_loss_mlp": 1.0005486, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 1.8656223749740741, + "language_loss": 0.70815408, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73046505, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.651667833328247 + }, + { + "auxiliary_loss_clip": 0.01103709, + "auxiliary_loss_mlp": 0.01109838, + "balance_loss_clip": 1.00167537, + "balance_loss_mlp": 1.00064206, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.6952693921060926, + "language_loss": 0.84208667, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86422217, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.694230079650879 + }, + { + "auxiliary_loss_clip": 0.01134884, + "auxiliary_loss_mlp": 0.01109002, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00056911, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.770176156316514, + "language_loss": 0.80268866, + "learning_rate": 1.437333263694373e-06, + "loss": 0.82512748, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.5666143894195557 + }, + { + "auxiliary_loss_clip": 0.01086618, + "auxiliary_loss_mlp": 0.01110032, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00054979, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.542122426858387, + "language_loss": 0.70851779, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73048425, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.761711597442627 + }, + { + "auxiliary_loss_clip": 0.0110157, + "auxiliary_loss_mlp": 0.01110235, + "balance_loss_clip": 1.00173545, + "balance_loss_mlp": 1.00046706, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.591368681645477, + "language_loss": 0.73121637, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75333446, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.876857042312622 + }, + { + "auxiliary_loss_clip": 0.01135643, + "auxiliary_loss_mlp": 0.01110191, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00051904, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.6130431250629444, + "language_loss": 0.68298066, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70543903, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 2.562041997909546 + }, + { + "auxiliary_loss_clip": 0.01135455, + "auxiliary_loss_mlp": 0.011088, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00065386, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 1.7288931814789243, + "language_loss": 0.76154631, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78398883, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 2.5619826316833496 + }, + { + "auxiliary_loss_clip": 0.01133285, + "auxiliary_loss_mlp": 0.01111226, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00079024, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 1.7295389146094395, + "language_loss": 0.74285626, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76530141, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 2.6209075450897217 + }, + { + "auxiliary_loss_clip": 0.01135351, + "auxiliary_loss_mlp": 0.01109554, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.00045347, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.5730698219725812, + "language_loss": 0.86393875, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88638777, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 2.5691893100738525 + }, + { + "auxiliary_loss_clip": 0.01116591, + "auxiliary_loss_mlp": 0.0111068, + "balance_loss_clip": 1.00192583, + "balance_loss_mlp": 1.00062573, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.0895079146462017, + "language_loss": 0.70326847, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72554123, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.65716552734375 + }, + { + "auxiliary_loss_clip": 0.0115172, + "auxiliary_loss_mlp": 0.01110156, + "balance_loss_clip": 1.0020957, + "balance_loss_mlp": 1.00048339, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.6549786069982912, + "language_loss": 0.85031706, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87293583, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.6257925033569336 + }, + { + "auxiliary_loss_clip": 0.01136283, + "auxiliary_loss_mlp": 0.01110033, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00055134, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 2.4436646592928244, + "language_loss": 0.76215708, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78462017, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.5717921257019043 + }, + { + "auxiliary_loss_clip": 0.01149778, + "auxiliary_loss_mlp": 0.01109268, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00064421, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.6040779915463075, + "language_loss": 0.71187037, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73446083, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 2.5721757411956787 + }, + { + "auxiliary_loss_clip": 0.01150046, + "auxiliary_loss_mlp": 0.01111368, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00055075, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 1.993573182457186, + "language_loss": 0.78131998, + "learning_rate": 1.433223512712475e-06, + "loss": 0.8039341, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.5329713821411133 + }, + { + "auxiliary_loss_clip": 0.01134267, + "auxiliary_loss_mlp": 0.01109378, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00046873, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.7497659443288573, + "language_loss": 0.75386173, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77629817, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.5604186058044434 + }, + { + "auxiliary_loss_clip": 0.01102685, + "auxiliary_loss_mlp": 0.01110222, + "balance_loss_clip": 1.00171018, + "balance_loss_mlp": 1.00054932, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 1.8202982570843185, + "language_loss": 0.84772909, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86985815, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 2.6406233310699463 + }, + { + "auxiliary_loss_clip": 0.01118539, + "auxiliary_loss_mlp": 0.01111396, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.00067437, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.9008103250177737, + "language_loss": 0.70089298, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72319239, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 2.6462841033935547 + }, + { + "auxiliary_loss_clip": 0.0115002, + "auxiliary_loss_mlp": 0.01111008, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00047755, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 1.604725053274784, + "language_loss": 0.78402352, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80663383, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 2.613934278488159 + }, + { + "auxiliary_loss_clip": 0.01088014, + "auxiliary_loss_mlp": 0.01109799, + "balance_loss_clip": 1.00166476, + "balance_loss_mlp": 1.00060356, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.8825481784776104, + "language_loss": 0.77040875, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79238689, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 4.133360862731934 + }, + { + "auxiliary_loss_clip": 0.01105029, + "auxiliary_loss_mlp": 0.01109507, + "balance_loss_clip": 1.00177491, + "balance_loss_mlp": 1.00069249, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.5106114396472037, + "language_loss": 0.87104857, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89319396, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.677934169769287 + }, + { + "auxiliary_loss_clip": 0.01151933, + "auxiliary_loss_mlp": 0.01109061, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.00053334, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.5515937908917994, + "language_loss": 0.75746328, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.78007323, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 2.6223764419555664 + }, + { + "auxiliary_loss_clip": 0.01149501, + "auxiliary_loss_mlp": 0.01111581, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00057316, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 1.9334730516656213, + "language_loss": 0.65778887, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68039966, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.61647629737854 + }, + { + "auxiliary_loss_clip": 0.01136321, + "auxiliary_loss_mlp": 0.01110638, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00058436, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.6887191921788145, + "language_loss": 0.66715163, + "learning_rate": 1.429862922631336e-06, + "loss": 0.68962127, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.583894968032837 + }, + { + "auxiliary_loss_clip": 0.01118361, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00060189, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.0473800226810788, + "language_loss": 0.69868958, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.72097784, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 2.7218992710113525 + }, + { + "auxiliary_loss_clip": 0.01149273, + "auxiliary_loss_mlp": 0.01110081, + "balance_loss_clip": 1.0016923, + "balance_loss_mlp": 1.00050378, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.8364416025232642, + "language_loss": 0.64355886, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66615236, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.561408281326294 + }, + { + "auxiliary_loss_clip": 0.01133344, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.0006969, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 2.0154162290059863, + "language_loss": 0.68724436, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.70968246, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.624495267868042 + }, + { + "auxiliary_loss_clip": 0.0113139, + "auxiliary_loss_mlp": 0.01088629, + "balance_loss_clip": 1.00147772, + "balance_loss_mlp": 1.00012767, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.9251002745469071, + "language_loss": 0.60394466, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62614489, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 4.667344808578491 + }, + { + "auxiliary_loss_clip": 0.0108493, + "auxiliary_loss_mlp": 0.01108619, + "balance_loss_clip": 1.00175428, + "balance_loss_mlp": 1.00056791, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.544767731814332, + "language_loss": 0.85597539, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87791091, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 2.9410998821258545 + }, + { + "auxiliary_loss_clip": 0.01136372, + "auxiliary_loss_mlp": 0.0111063, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00067151, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.099412542293542, + "language_loss": 0.73544228, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75791228, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 4.003470182418823 + }, + { + "auxiliary_loss_clip": 0.01120346, + "auxiliary_loss_mlp": 0.01110805, + "balance_loss_clip": 1.002141, + "balance_loss_mlp": 1.00065589, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 6.166384993345702, + "language_loss": 0.80189764, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82420915, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 2.7156941890716553 + }, + { + "auxiliary_loss_clip": 0.01166765, + "auxiliary_loss_mlp": 0.00747696, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.0011183, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.1162960254423684, + "language_loss": 0.75240505, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77154964, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 2.49635648727417 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.01109335, + "balance_loss_clip": 1.00193655, + "balance_loss_mlp": 1.00061572, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 2.0876271795374186, + "language_loss": 0.71184123, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73443371, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.5752146244049072 + }, + { + "auxiliary_loss_clip": 0.01133311, + "auxiliary_loss_mlp": 0.01109913, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00052667, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.4719950148481318, + "language_loss": 0.76195323, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.7843855, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 4.0316596031188965 + }, + { + "auxiliary_loss_clip": 0.01151428, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00047398, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 1.9063579176435335, + "language_loss": 0.7334764, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75608933, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.567168951034546 + }, + { + "auxiliary_loss_clip": 0.01105094, + "auxiliary_loss_mlp": 0.00747841, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.0011003, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.6171819690167997, + "language_loss": 0.67513192, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69366127, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 2.6712045669555664 + }, + { + "auxiliary_loss_clip": 0.01152048, + "auxiliary_loss_mlp": 0.01109524, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.0007093, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.848033075540404, + "language_loss": 0.71437508, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73699081, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.5220258235931396 + }, + { + "auxiliary_loss_clip": 0.01166676, + "auxiliary_loss_mlp": 0.01109628, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.00062311, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.5962502520658786, + "language_loss": 0.84732825, + "learning_rate": 1.424638822621926e-06, + "loss": 0.87009132, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 2.48405385017395 + }, + { + "auxiliary_loss_clip": 0.01151051, + "auxiliary_loss_mlp": 0.01110309, + "balance_loss_clip": 1.00191259, + "balance_loss_mlp": 1.0006361, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.071072725006634, + "language_loss": 0.79454476, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81715834, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.5078275203704834 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01111933, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00054419, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 1.9532364519404213, + "language_loss": 0.78570747, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80786359, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.631488561630249 + }, + { + "auxiliary_loss_clip": 0.0108754, + "auxiliary_loss_mlp": 0.01109927, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00063634, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.4761213176095491, + "language_loss": 0.73332238, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75529712, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.724848508834839 + }, + { + "auxiliary_loss_clip": 0.01133311, + "auxiliary_loss_mlp": 0.00747622, + "balance_loss_clip": 1.00194979, + "balance_loss_mlp": 1.00099754, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.40136481602964, + "language_loss": 0.6901592, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70896852, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.6068778038024902 + }, + { + "auxiliary_loss_clip": 0.0115031, + "auxiliary_loss_mlp": 0.01110307, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00044405, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 1.8541842653778913, + "language_loss": 0.87131858, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89392483, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.527263641357422 + }, + { + "auxiliary_loss_clip": 0.01117826, + "auxiliary_loss_mlp": 0.01110068, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00058591, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.4735602416136295, + "language_loss": 0.83109343, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85337234, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.680114269256592 + }, + { + "auxiliary_loss_clip": 0.01137154, + "auxiliary_loss_mlp": 0.01110156, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00057936, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.495650551781667, + "language_loss": 0.86219025, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88466334, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.5618155002593994 + }, + { + "auxiliary_loss_clip": 0.01150046, + "auxiliary_loss_mlp": 0.01110749, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00069475, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 2.657803682187221, + "language_loss": 0.7708354, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79344332, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 2.62015700340271 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01109961, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00038397, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.5544726676400293, + "language_loss": 0.74363792, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76609945, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.6185269355773926 + }, + { + "auxiliary_loss_clip": 0.01102447, + "auxiliary_loss_mlp": 0.0108828, + "balance_loss_clip": 1.00161076, + "balance_loss_mlp": 1.0001601, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.751323790524583, + "language_loss": 0.55242801, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57433522, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 3.5191540718078613 + }, + { + "auxiliary_loss_clip": 0.01118239, + "auxiliary_loss_mlp": 0.0110996, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00057387, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.826294863124651, + "language_loss": 0.82105756, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.84333956, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 2.8037962913513184 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01110012, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00043535, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 1.6326627206885105, + "language_loss": 0.77975571, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80237067, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 2.597579002380371 + }, + { + "auxiliary_loss_clip": 0.01152111, + "auxiliary_loss_mlp": 0.01109737, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00054121, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.5601010928517927, + "language_loss": 0.72215915, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74477762, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.5641210079193115 + }, + { + "auxiliary_loss_clip": 0.01166673, + "auxiliary_loss_mlp": 0.01110677, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00052786, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 5.258423073158754, + "language_loss": 0.55118263, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57395613, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.4949758052825928 + }, + { + "auxiliary_loss_clip": 0.0110285, + "auxiliary_loss_mlp": 0.01109699, + "balance_loss_clip": 1.00165868, + "balance_loss_mlp": 1.00050306, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.4388782228941623, + "language_loss": 0.70261317, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72473866, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.710498809814453 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01110371, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.00069857, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.6608420630213712, + "language_loss": 0.62028074, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64271653, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.579850673675537 + }, + { + "auxiliary_loss_clip": 0.01135362, + "auxiliary_loss_mlp": 0.01109494, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00058424, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.6807741461740127, + "language_loss": 0.70754695, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.72999549, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.5800251960754395 + }, + { + "auxiliary_loss_clip": 0.01134687, + "auxiliary_loss_mlp": 0.01109779, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00058305, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.585558911774718, + "language_loss": 0.69189811, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71434277, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 2.648188352584839 + }, + { + "auxiliary_loss_clip": 0.01166793, + "auxiliary_loss_mlp": 0.01109751, + "balance_loss_clip": 1.00206923, + "balance_loss_mlp": 1.00065041, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 2.525027180235893, + "language_loss": 0.65669751, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67946297, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 2.536907196044922 + }, + { + "auxiliary_loss_clip": 0.01149898, + "auxiliary_loss_mlp": 0.01109949, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00046754, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 1.916631200316663, + "language_loss": 0.74520469, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76780319, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 4.142187118530273 + }, + { + "auxiliary_loss_clip": 0.01134916, + "auxiliary_loss_mlp": 0.01110022, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.00063562, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.359120985544409, + "language_loss": 0.72837591, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.75082529, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.5899245738983154 + }, + { + "auxiliary_loss_clip": 0.01166719, + "auxiliary_loss_mlp": 0.0110989, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00050402, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.037049992888123, + "language_loss": 0.76151478, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.7842809, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 2.5113399028778076 + }, + { + "auxiliary_loss_clip": 0.01135611, + "auxiliary_loss_mlp": 0.01109152, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00052857, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.3785116583072787, + "language_loss": 0.72543609, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.7478838, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 2.593287944793701 + }, + { + "auxiliary_loss_clip": 0.01149855, + "auxiliary_loss_mlp": 0.01108541, + "balance_loss_clip": 1.00187933, + "balance_loss_mlp": 1.00077558, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.5202341331274247, + "language_loss": 0.83742332, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86000729, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.55935001373291 + }, + { + "auxiliary_loss_clip": 0.0110357, + "auxiliary_loss_mlp": 0.00747646, + "balance_loss_clip": 1.0016892, + "balance_loss_mlp": 1.00091457, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.6178219364493973, + "language_loss": 0.71207154, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73058367, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 2.6702311038970947 + }, + { + "auxiliary_loss_clip": 0.01149908, + "auxiliary_loss_mlp": 0.0110958, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00067043, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 1.7612757422991272, + "language_loss": 0.82604039, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.8486352, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.518125534057617 + }, + { + "auxiliary_loss_clip": 0.01116893, + "auxiliary_loss_mlp": 0.01110839, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00068951, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.3650855699313764, + "language_loss": 0.76227117, + "learning_rate": 1.4145758826341e-06, + "loss": 0.78454852, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 3.9562666416168213 + }, + { + "auxiliary_loss_clip": 0.01166671, + "auxiliary_loss_mlp": 0.01109424, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00051427, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.4751303914265588, + "language_loss": 0.7953499, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81811082, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.5089917182922363 + }, + { + "auxiliary_loss_clip": 0.01133249, + "auxiliary_loss_mlp": 0.01109485, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00057495, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 2.136623336360464, + "language_loss": 0.76397336, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78640068, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 3.9640471935272217 + }, + { + "auxiliary_loss_clip": 0.01134805, + "auxiliary_loss_mlp": 0.0110857, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00051832, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.7743715699815503, + "language_loss": 0.87376392, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89619762, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.583921194076538 + }, + { + "auxiliary_loss_clip": 0.01151867, + "auxiliary_loss_mlp": 0.01109921, + "balance_loss_clip": 1.00197446, + "balance_loss_mlp": 1.00043869, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.5230175493996487, + "language_loss": 0.72207278, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74469066, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.5095207691192627 + }, + { + "auxiliary_loss_clip": 0.01135108, + "auxiliary_loss_mlp": 0.01109001, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00047255, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.7102335071291115, + "language_loss": 0.76555234, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78799343, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 2.5730297565460205 + }, + { + "auxiliary_loss_clip": 0.01166703, + "auxiliary_loss_mlp": 0.01110141, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00065935, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 2.1889058915242026, + "language_loss": 0.79647499, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.81924343, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.4813811779022217 + }, + { + "auxiliary_loss_clip": 0.01135319, + "auxiliary_loss_mlp": 0.01109148, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00052476, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.4498742160143367, + "language_loss": 0.67215621, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69460088, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 3.9799203872680664 + }, + { + "auxiliary_loss_clip": 0.0116659, + "auxiliary_loss_mlp": 0.01109389, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00057483, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.7782093423174317, + "language_loss": 0.80285692, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82561672, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.486828327178955 + }, + { + "auxiliary_loss_clip": 0.01116412, + "auxiliary_loss_mlp": 0.01110245, + "balance_loss_clip": 1.001647, + "balance_loss_mlp": 1.00066745, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 1.7972269749239163, + "language_loss": 0.70817351, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.73044008, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 2.621384382247925 + }, + { + "auxiliary_loss_clip": 0.01118589, + "auxiliary_loss_mlp": 0.01109721, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.00052547, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.8819588196951424, + "language_loss": 0.70957971, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.73186278, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.5881588459014893 + }, + { + "auxiliary_loss_clip": 0.01135398, + "auxiliary_loss_mlp": 0.01108505, + "balance_loss_clip": 1.0016644, + "balance_loss_mlp": 1.00045335, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.7659788518996906, + "language_loss": 0.69388074, + "learning_rate": 1.410480790256154e-06, + "loss": 0.7163198, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 2.662172794342041 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01110071, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.00058913, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.88754811428026, + "language_loss": 0.73617786, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75894666, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 2.5197136402130127 + }, + { + "auxiliary_loss_clip": 0.01116092, + "auxiliary_loss_mlp": 0.01110402, + "balance_loss_clip": 1.00205696, + "balance_loss_mlp": 1.00063443, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.6506077345646333, + "language_loss": 0.76813978, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.79040468, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.6401748657226562 + }, + { + "auxiliary_loss_clip": 0.01115249, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_clip": 1.0012722, + "balance_loss_mlp": 1.00010252, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7073013147888182, + "language_loss": 0.56008643, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58212119, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.2263834476470947 + }, + { + "auxiliary_loss_clip": 0.01145953, + "auxiliary_loss_mlp": 0.01088452, + "balance_loss_clip": 1.00145698, + "balance_loss_mlp": 1.00033212, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7711488571959195, + "language_loss": 0.56827313, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.59061718, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.068723201751709 + }, + { + "auxiliary_loss_clip": 0.01103697, + "auxiliary_loss_mlp": 0.01108741, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00068951, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.4127283749178503, + "language_loss": 0.68605578, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70818013, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.71512770652771 + }, + { + "auxiliary_loss_clip": 0.01151914, + "auxiliary_loss_mlp": 0.01109859, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00056779, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 1.664330063819775, + "language_loss": 0.80882376, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83144152, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.510038137435913 + }, + { + "auxiliary_loss_clip": 0.011356, + "auxiliary_loss_mlp": 0.0110989, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00069451, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.53174994747133, + "language_loss": 0.71245915, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73491406, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.719162702560425 + }, + { + "auxiliary_loss_clip": 0.011325, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.00177515, + "balance_loss_mlp": 1.00058532, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.5707145851979805, + "language_loss": 0.80461097, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82700521, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.583129405975342 + }, + { + "auxiliary_loss_clip": 0.01135375, + "auxiliary_loss_mlp": 0.01109417, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00050735, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.1672443246489848, + "language_loss": 0.71035802, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.73280597, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 2.5892434120178223 + }, + { + "auxiliary_loss_clip": 0.01117865, + "auxiliary_loss_mlp": 0.01110188, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00051546, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.849132702888211, + "language_loss": 0.65260684, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67488742, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.6162381172180176 + }, + { + "auxiliary_loss_clip": 0.0114798, + "auxiliary_loss_mlp": 0.01088443, + "balance_loss_clip": 1.00146258, + "balance_loss_mlp": 1.00032318, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.629791130398072, + "language_loss": 0.49597821, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51834244, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 3.1572539806365967 + }, + { + "auxiliary_loss_clip": 0.01147991, + "auxiliary_loss_mlp": 0.0108793, + "balance_loss_clip": 1.00152826, + "balance_loss_mlp": 1.00019157, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8565103940792088, + "language_loss": 0.57006061, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59241986, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 3.011930227279663 + }, + { + "auxiliary_loss_clip": 0.01166893, + "auxiliary_loss_mlp": 0.01109356, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00063753, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.7188586319532704, + "language_loss": 0.69743752, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72019994, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 2.500281572341919 + }, + { + "auxiliary_loss_clip": 0.01120316, + "auxiliary_loss_mlp": 0.01108857, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00061488, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 1.5477233192702202, + "language_loss": 0.72329313, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74558485, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 2.6294989585876465 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.01110482, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.0006187, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.692858168826941, + "language_loss": 0.53696877, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55943084, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.7194931507110596 + }, + { + "auxiliary_loss_clip": 0.01134241, + "auxiliary_loss_mlp": 0.01108834, + "balance_loss_clip": 1.00183022, + "balance_loss_mlp": 1.00040102, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.7655900667051536, + "language_loss": 0.7010268, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72345757, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.5831243991851807 + }, + { + "auxiliary_loss_clip": 0.01086145, + "auxiliary_loss_mlp": 0.01108584, + "balance_loss_clip": 1.00188649, + "balance_loss_mlp": 1.00062847, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.6950699144386436, + "language_loss": 0.74908692, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.77103418, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.705206871032715 + }, + { + "auxiliary_loss_clip": 0.01149953, + "auxiliary_loss_mlp": 0.01109376, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00056207, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 2.1126250319462905, + "language_loss": 0.67893887, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.70153213, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.5419723987579346 + }, + { + "auxiliary_loss_clip": 0.01149963, + "auxiliary_loss_mlp": 0.01110146, + "balance_loss_clip": 1.00194299, + "balance_loss_mlp": 1.00056887, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.9278754576989292, + "language_loss": 0.74310839, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.7657094, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 4.053394556045532 + }, + { + "auxiliary_loss_clip": 0.01151626, + "auxiliary_loss_mlp": 0.01108587, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00053573, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.9439045959709234, + "language_loss": 0.80497622, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82757831, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.5300421714782715 + }, + { + "auxiliary_loss_clip": 0.01152221, + "auxiliary_loss_mlp": 0.01109745, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00074017, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.5489605158047033, + "language_loss": 0.55626279, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57888246, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.6663918495178223 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01108846, + "balance_loss_clip": 1.0020231, + "balance_loss_mlp": 1.00069928, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.7242177768817115, + "language_loss": 0.74027854, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76287693, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.5397181510925293 + }, + { + "auxiliary_loss_clip": 0.01136934, + "auxiliary_loss_mlp": 0.01109297, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00067341, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 1.9680172512554657, + "language_loss": 0.65445179, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67691416, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 2.579672336578369 + }, + { + "auxiliary_loss_clip": 0.01166651, + "auxiliary_loss_mlp": 0.01108613, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00065732, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 1.5623873695826658, + "language_loss": 0.75966424, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78241682, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.534575939178467 + }, + { + "auxiliary_loss_clip": 0.01116798, + "auxiliary_loss_mlp": 0.01110103, + "balance_loss_clip": 1.00184619, + "balance_loss_mlp": 1.0006206, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.497799018518089, + "language_loss": 0.71249831, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73476732, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 2.6367814540863037 + }, + { + "auxiliary_loss_clip": 0.01166801, + "auxiliary_loss_mlp": 0.01111102, + "balance_loss_clip": 1.00213122, + "balance_loss_mlp": 1.000476, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.7051542110216795, + "language_loss": 0.73000258, + "learning_rate": 1.400812267497691e-06, + "loss": 0.75278163, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.5432934761047363 + }, + { + "auxiliary_loss_clip": 0.0110077, + "auxiliary_loss_mlp": 0.01109373, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00065374, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.1116559238699857, + "language_loss": 0.73173839, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75383979, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 4.123414993286133 + }, + { + "auxiliary_loss_clip": 0.01166656, + "auxiliary_loss_mlp": 0.01109793, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00059772, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.3573652447280005, + "language_loss": 0.65585792, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67862236, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 2.6718807220458984 + }, + { + "auxiliary_loss_clip": 0.01133105, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.0005089, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.5142519011714173, + "language_loss": 0.76710176, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.78951752, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.565964698791504 + }, + { + "auxiliary_loss_clip": 0.01118263, + "auxiliary_loss_mlp": 0.01107932, + "balance_loss_clip": 1.00170147, + "balance_loss_mlp": 1.0005486, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.7369278252262246, + "language_loss": 0.77152216, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79378408, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 3.976893186569214 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01108127, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00055289, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.819650496989593, + "language_loss": 0.75660467, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77935082, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.497058391571045 + }, + { + "auxiliary_loss_clip": 0.01149907, + "auxiliary_loss_mlp": 0.01109142, + "balance_loss_clip": 1.0019176, + "balance_loss_mlp": 1.00061417, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.760529856261283, + "language_loss": 0.638565, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.66115552, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 2.5823919773101807 + }, + { + "auxiliary_loss_clip": 0.01133108, + "auxiliary_loss_mlp": 0.01107839, + "balance_loss_clip": 1.00191987, + "balance_loss_mlp": 1.00045562, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 2.1512075465382052, + "language_loss": 0.78575218, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80816162, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 3.9817748069763184 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01109299, + "balance_loss_clip": 1.00190222, + "balance_loss_mlp": 1.00048494, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 1.930644826601101, + "language_loss": 0.71964294, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74206686, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 2.600687026977539 + }, + { + "auxiliary_loss_clip": 0.01166752, + "auxiliary_loss_mlp": 0.01109726, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00062609, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 1.6904813781787844, + "language_loss": 0.74707705, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76984179, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.6163806915283203 + }, + { + "auxiliary_loss_clip": 0.01151569, + "auxiliary_loss_mlp": 0.01109554, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.00073957, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.6617973057480009, + "language_loss": 0.79823482, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82084608, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 2.5598158836364746 + }, + { + "auxiliary_loss_clip": 0.01133083, + "auxiliary_loss_mlp": 0.01108979, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00064206, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.4220918557135906, + "language_loss": 0.81026626, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83268696, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 2.546776533126831 + }, + { + "auxiliary_loss_clip": 0.0111771, + "auxiliary_loss_mlp": 0.01110187, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00070477, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.3533633651632053, + "language_loss": 0.83330083, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85557985, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.551177740097046 + }, + { + "auxiliary_loss_clip": 0.01149771, + "auxiliary_loss_mlp": 0.01109595, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00068533, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 2.2974942445972646, + "language_loss": 0.75525117, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77784485, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.573495864868164 + }, + { + "auxiliary_loss_clip": 0.011346, + "auxiliary_loss_mlp": 0.01108974, + "balance_loss_clip": 1.00174081, + "balance_loss_mlp": 1.0004462, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.8766547250330947, + "language_loss": 0.76607931, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78851509, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 2.543133020401001 + }, + { + "auxiliary_loss_clip": 0.01166585, + "auxiliary_loss_mlp": 0.0110912, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00049663, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.6369072179059245, + "language_loss": 0.7668575, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78961456, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.5424489974975586 + }, + { + "auxiliary_loss_clip": 0.01151558, + "auxiliary_loss_mlp": 0.01109395, + "balance_loss_clip": 1.0019263, + "balance_loss_mlp": 1.00058031, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.6875692313462636, + "language_loss": 0.75166035, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77426982, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.506345510482788 + }, + { + "auxiliary_loss_clip": 0.01135083, + "auxiliary_loss_mlp": 0.01110442, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00048351, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 1.9734451235515393, + "language_loss": 0.72850728, + "learning_rate": 1.394498830235383e-06, + "loss": 0.7509625, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.7754011154174805 + }, + { + "auxiliary_loss_clip": 0.01137005, + "auxiliary_loss_mlp": 0.01109167, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00063896, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 2.131811061235246, + "language_loss": 0.69426709, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71672887, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.578167200088501 + }, + { + "auxiliary_loss_clip": 0.01103358, + "auxiliary_loss_mlp": 0.00747531, + "balance_loss_clip": 1.00193441, + "balance_loss_mlp": 1.00105214, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 2.4186062203332135, + "language_loss": 0.76973081, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78823966, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.6843197345733643 + }, + { + "auxiliary_loss_clip": 0.01134456, + "auxiliary_loss_mlp": 0.01108351, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00048995, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.6190062520533832, + "language_loss": 0.78383124, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80625939, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.5644984245300293 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01110354, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00068176, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 1.937687276620307, + "language_loss": 0.53971565, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56202179, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.6710615158081055 + }, + { + "auxiliary_loss_clip": 0.01133488, + "auxiliary_loss_mlp": 0.01108879, + "balance_loss_clip": 1.0019697, + "balance_loss_mlp": 1.00063694, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.6588759649891989, + "language_loss": 0.80333018, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82575387, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 2.5904765129089355 + }, + { + "auxiliary_loss_clip": 0.01133483, + "auxiliary_loss_mlp": 0.01109955, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00056887, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.54879833043248, + "language_loss": 0.68709499, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.70952934, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 2.603874683380127 + }, + { + "auxiliary_loss_clip": 0.01166552, + "auxiliary_loss_mlp": 0.01108333, + "balance_loss_clip": 1.00197232, + "balance_loss_mlp": 1.00047302, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.8257891031594993, + "language_loss": 0.71119845, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.73394734, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.5310592651367188 + }, + { + "auxiliary_loss_clip": 0.0111718, + "auxiliary_loss_mlp": 0.01109357, + "balance_loss_clip": 1.00191891, + "balance_loss_mlp": 1.00044751, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.6638338471226697, + "language_loss": 0.78268349, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80494893, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 2.6030685901641846 + }, + { + "auxiliary_loss_clip": 0.01134622, + "auxiliary_loss_mlp": 0.01109558, + "balance_loss_clip": 1.00188267, + "balance_loss_mlp": 1.00074422, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.624296569155902, + "language_loss": 0.79555428, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81799603, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 2.6145310401916504 + }, + { + "auxiliary_loss_clip": 0.01150248, + "auxiliary_loss_mlp": 0.01108816, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.0005734, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.453155515472253, + "language_loss": 0.70363021, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72622085, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.5627784729003906 + }, + { + "auxiliary_loss_clip": 0.01149949, + "auxiliary_loss_mlp": 0.0110975, + "balance_loss_clip": 1.00197065, + "balance_loss_mlp": 1.00055397, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5602451753377586, + "language_loss": 0.71297133, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73556828, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 2.654195547103882 + }, + { + "auxiliary_loss_clip": 0.01132955, + "auxiliary_loss_mlp": 0.01108737, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00068569, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.5971921525791968, + "language_loss": 0.67481202, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69722891, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.602971076965332 + }, + { + "auxiliary_loss_clip": 0.01120011, + "auxiliary_loss_mlp": 0.01109312, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00059319, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 2.1490363592439676, + "language_loss": 0.72574788, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74804115, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 3.9920332431793213 + }, + { + "auxiliary_loss_clip": 0.01150304, + "auxiliary_loss_mlp": 0.01109648, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00073838, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.5440987743013168, + "language_loss": 0.69292897, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71552849, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 2.60943341255188 + }, + { + "auxiliary_loss_clip": 0.01166677, + "auxiliary_loss_mlp": 0.01109577, + "balance_loss_clip": 1.00203919, + "balance_loss_mlp": 1.00057209, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 2.45217081412639, + "language_loss": 0.79029351, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81305611, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 2.4930126667022705 + }, + { + "auxiliary_loss_clip": 0.01145654, + "auxiliary_loss_mlp": 0.01087873, + "balance_loss_clip": 1.00125742, + "balance_loss_mlp": 1.0001353, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8090667728924439, + "language_loss": 0.61479294, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63712823, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.270834445953369 + }, + { + "auxiliary_loss_clip": 0.0113363, + "auxiliary_loss_mlp": 0.00747869, + "balance_loss_clip": 1.00185621, + "balance_loss_mlp": 1.00111079, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.7101115570109207, + "language_loss": 0.76423353, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78304851, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.6005053520202637 + }, + { + "auxiliary_loss_clip": 0.01166633, + "auxiliary_loss_mlp": 0.01109217, + "balance_loss_clip": 1.00203145, + "balance_loss_mlp": 1.00059307, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.7041665457007886, + "language_loss": 0.71517861, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73793709, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.5975773334503174 + }, + { + "auxiliary_loss_clip": 0.01166486, + "auxiliary_loss_mlp": 0.01108725, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00067377, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 1.9749213934419247, + "language_loss": 0.59637022, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61912233, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 2.522392749786377 + }, + { + "auxiliary_loss_clip": 0.01150122, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_clip": 1.0020138, + "balance_loss_mlp": 1.00054693, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 2.1182694742090615, + "language_loss": 0.75777447, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.7803731, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.567598342895508 + }, + { + "auxiliary_loss_clip": 0.01134731, + "auxiliary_loss_mlp": 0.01109338, + "balance_loss_clip": 1.00210321, + "balance_loss_mlp": 1.00061905, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.4388561606803782, + "language_loss": 0.79212046, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81456113, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.5841572284698486 + }, + { + "auxiliary_loss_clip": 0.01135342, + "auxiliary_loss_mlp": 0.01109099, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00057101, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 1.8088340850767517, + "language_loss": 0.67594588, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69839031, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 3.9675393104553223 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01108821, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.00067449, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 4.651842494820844, + "language_loss": 0.79017937, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81293392, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 2.5037946701049805 + }, + { + "auxiliary_loss_clip": 0.01166856, + "auxiliary_loss_mlp": 0.01111226, + "balance_loss_clip": 1.00206292, + "balance_loss_mlp": 1.00069559, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 2.1972059731207736, + "language_loss": 0.85996503, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.88274586, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 3.849210262298584 + }, + { + "auxiliary_loss_clip": 0.01166571, + "auxiliary_loss_mlp": 0.01108823, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00058091, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.7207143378894307, + "language_loss": 0.79355931, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81631321, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.674459218978882 + }, + { + "auxiliary_loss_clip": 0.01132627, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.00170004, + "balance_loss_mlp": 1.00050461, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 1.7644185362013258, + "language_loss": 0.68353677, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.70596385, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.5669374465942383 + }, + { + "auxiliary_loss_clip": 0.01119651, + "auxiliary_loss_mlp": 0.01110523, + "balance_loss_clip": 1.00181341, + "balance_loss_mlp": 1.00056422, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.4675883153768232, + "language_loss": 0.79195106, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81425285, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 2.6806437969207764 + }, + { + "auxiliary_loss_clip": 0.01117918, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_clip": 1.00168002, + "balance_loss_mlp": 1.0006175, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.7539426933108422, + "language_loss": 0.66731346, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.68959457, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 4.02468204498291 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.0111029, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.0006175, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.700540066259792, + "language_loss": 0.55875146, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.58120811, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 2.562390089035034 + }, + { + "auxiliary_loss_clip": 0.01135174, + "auxiliary_loss_mlp": 0.01110457, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00059414, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 3.7256134701343973, + "language_loss": 0.66212165, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68457794, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 2.594339370727539 + }, + { + "auxiliary_loss_clip": 0.01149703, + "auxiliary_loss_mlp": 0.00747843, + "balance_loss_clip": 1.00178754, + "balance_loss_mlp": 1.00119567, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 1.9307804424644497, + "language_loss": 0.8295055, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.848481, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.5829226970672607 + }, + { + "auxiliary_loss_clip": 0.01134231, + "auxiliary_loss_mlp": 0.01109588, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00058293, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 1.7640718989523956, + "language_loss": 0.77148092, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79391909, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.5955419540405273 + }, + { + "auxiliary_loss_clip": 0.0115176, + "auxiliary_loss_mlp": 0.00747776, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00105, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 1.9433417277258116, + "language_loss": 0.75729072, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77628613, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 2.5426323413848877 + }, + { + "auxiliary_loss_clip": 0.01134863, + "auxiliary_loss_mlp": 0.01110258, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00058568, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.5448547361294052, + "language_loss": 0.66759425, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69004548, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 2.588813066482544 + }, + { + "auxiliary_loss_clip": 0.0113429, + "auxiliary_loss_mlp": 0.01109908, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00061703, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.8724179494595246, + "language_loss": 0.83936667, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86180872, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.5769574642181396 + }, + { + "auxiliary_loss_clip": 0.01166651, + "auxiliary_loss_mlp": 0.01110148, + "balance_loss_clip": 1.00212193, + "balance_loss_mlp": 1.00047576, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 2.0016652808621447, + "language_loss": 0.77876109, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.80152905, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.5037384033203125 + }, + { + "auxiliary_loss_clip": 0.01166763, + "auxiliary_loss_mlp": 0.01109882, + "balance_loss_clip": 1.00202262, + "balance_loss_mlp": 1.00078201, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.8868510256430187, + "language_loss": 0.80806196, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.83082843, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 2.4674811363220215 + }, + { + "auxiliary_loss_clip": 0.0112129, + "auxiliary_loss_mlp": 0.01108636, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00058436, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 11.933984514675476, + "language_loss": 0.83155102, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85385025, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.626533031463623 + }, + { + "auxiliary_loss_clip": 0.01145408, + "auxiliary_loss_mlp": 0.01087826, + "balance_loss_clip": 1.00145948, + "balance_loss_mlp": 1.00008762, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7060205080772815, + "language_loss": 0.62831777, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.65065008, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.2320802211761475 + }, + { + "auxiliary_loss_clip": 0.01151369, + "auxiliary_loss_mlp": 0.0110882, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00057793, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 6.76120760344977, + "language_loss": 0.82198328, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84458524, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.546933650970459 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.0111013, + "balance_loss_clip": 1.0020498, + "balance_loss_mlp": 1.00064838, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 2.027501290827476, + "language_loss": 0.7477867, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.77024192, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 2.619983434677124 + }, + { + "auxiliary_loss_clip": 0.01151571, + "auxiliary_loss_mlp": 0.01108907, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00047469, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5937482090193607, + "language_loss": 0.78461856, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80722338, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 2.5741357803344727 + }, + { + "auxiliary_loss_clip": 0.01166626, + "auxiliary_loss_mlp": 0.01109656, + "balance_loss_clip": 1.00198591, + "balance_loss_mlp": 1.00065064, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 5.379446555081884, + "language_loss": 0.83031034, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85307312, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.5169413089752197 + }, + { + "auxiliary_loss_clip": 0.01118238, + "auxiliary_loss_mlp": 0.01109385, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00066614, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.7277377710062587, + "language_loss": 0.75641608, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77869225, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 2.605705499649048 + }, + { + "auxiliary_loss_clip": 0.01151785, + "auxiliary_loss_mlp": 0.01109545, + "balance_loss_clip": 1.00204933, + "balance_loss_mlp": 1.00063586, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.4940257882961, + "language_loss": 0.73964596, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.7622593, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.52557373046875 + }, + { + "auxiliary_loss_clip": 0.01149355, + "auxiliary_loss_mlp": 0.01110078, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00050116, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.5093611861248222, + "language_loss": 0.68234992, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70494425, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 2.584566593170166 + }, + { + "auxiliary_loss_clip": 0.01151793, + "auxiliary_loss_mlp": 0.01109947, + "balance_loss_clip": 1.00194228, + "balance_loss_mlp": 1.00056088, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 1.9082692234931649, + "language_loss": 0.73768544, + "learning_rate": 1.377078777445467e-06, + "loss": 0.76030278, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 2.575216770172119 + }, + { + "auxiliary_loss_clip": 0.01115308, + "auxiliary_loss_mlp": 0.01108478, + "balance_loss_clip": 1.00184989, + "balance_loss_mlp": 1.00061774, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.7106659592042908, + "language_loss": 0.83282208, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85505998, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.6155881881713867 + }, + { + "auxiliary_loss_clip": 0.01116276, + "auxiliary_loss_mlp": 0.0110898, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.00054717, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.621745145482123, + "language_loss": 0.70131868, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.7235713, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.6466872692108154 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01088186, + "balance_loss_clip": 1.00123835, + "balance_loss_mlp": 1.00006664, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8285563237322311, + "language_loss": 0.58676076, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60877866, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 2.972513198852539 + }, + { + "auxiliary_loss_clip": 0.01134972, + "auxiliary_loss_mlp": 0.01108989, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00065207, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 1.8720207691530841, + "language_loss": 0.69404113, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71648073, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.554586172103882 + }, + { + "auxiliary_loss_clip": 0.01136787, + "auxiliary_loss_mlp": 0.01109284, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00056505, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.6831838798555545, + "language_loss": 0.71584427, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73830491, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 3.945350408554077 + }, + { + "auxiliary_loss_clip": 0.01151729, + "auxiliary_loss_mlp": 0.01109729, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00072384, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.817159228588516, + "language_loss": 0.78956735, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81218189, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 2.533639907836914 + }, + { + "auxiliary_loss_clip": 0.01118195, + "auxiliary_loss_mlp": 0.01110091, + "balance_loss_clip": 1.00173783, + "balance_loss_mlp": 1.000705, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.4072041990913187, + "language_loss": 0.74492145, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76720428, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 2.633009910583496 + }, + { + "auxiliary_loss_clip": 0.0113329, + "auxiliary_loss_mlp": 0.01110208, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00072634, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.5441388831241456, + "language_loss": 0.61999512, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64243007, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.637422800064087 + }, + { + "auxiliary_loss_clip": 0.0113548, + "auxiliary_loss_mlp": 0.01109598, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00059342, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.8367529753422966, + "language_loss": 0.68681139, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70926213, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.5938308238983154 + }, + { + "auxiliary_loss_clip": 0.01134844, + "auxiliary_loss_mlp": 0.0110975, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00055468, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 2.0891080225329737, + "language_loss": 0.83352256, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85596848, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 2.6189544200897217 + }, + { + "auxiliary_loss_clip": 0.01162367, + "auxiliary_loss_mlp": 0.01087771, + "balance_loss_clip": 1.00129461, + "balance_loss_mlp": 1.0000329, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.903067220670831, + "language_loss": 0.67085969, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69336104, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 3.1030173301696777 + }, + { + "auxiliary_loss_clip": 0.01151826, + "auxiliary_loss_mlp": 0.01109268, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.0005492, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.480130675752491, + "language_loss": 0.60860777, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63121867, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 2.7475204467773438 + }, + { + "auxiliary_loss_clip": 0.01117739, + "auxiliary_loss_mlp": 0.01109224, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00060081, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.6503350573040394, + "language_loss": 0.72248745, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74475706, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 4.03745698928833 + }, + { + "auxiliary_loss_clip": 0.01149723, + "auxiliary_loss_mlp": 0.01108928, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.0004952, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.7241580558869378, + "language_loss": 0.76038969, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78297615, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 3.9719457626342773 + }, + { + "auxiliary_loss_clip": 0.01104947, + "auxiliary_loss_mlp": 0.01109387, + "balance_loss_clip": 1.00175798, + "balance_loss_mlp": 1.00047719, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 1.971514359590722, + "language_loss": 0.75351292, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77565628, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 2.681893825531006 + }, + { + "auxiliary_loss_clip": 0.01149953, + "auxiliary_loss_mlp": 0.01110143, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00075674, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.2764183932096795, + "language_loss": 0.82581615, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.8484171, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.4886884689331055 + }, + { + "auxiliary_loss_clip": 0.01134814, + "auxiliary_loss_mlp": 0.01110249, + "balance_loss_clip": 1.00195611, + "balance_loss_mlp": 1.00057697, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 1.648615674378219, + "language_loss": 0.72667766, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74912834, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 2.6849381923675537 + }, + { + "auxiliary_loss_clip": 0.01166664, + "auxiliary_loss_mlp": 0.0111033, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00084877, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.5645939457227036, + "language_loss": 0.74213904, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76490897, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.5418694019317627 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.01088229, + "balance_loss_clip": 1.00128961, + "balance_loss_mlp": 1.0001092, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8604236416156603, + "language_loss": 0.64971274, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67177826, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 4.755095958709717 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.00747827, + "balance_loss_clip": 1.00168741, + "balance_loss_mlp": 1.00101447, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.5653573371738907, + "language_loss": 0.75726461, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77607262, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 2.612208127975464 + }, + { + "auxiliary_loss_clip": 0.01134871, + "auxiliary_loss_mlp": 0.0111012, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.00082922, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.4944379684837146, + "language_loss": 0.73816353, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76061344, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.6376118659973145 + }, + { + "auxiliary_loss_clip": 0.01134467, + "auxiliary_loss_mlp": 0.01110481, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00071359, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.6982019142257836, + "language_loss": 0.72991133, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75236076, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 2.6295957565307617 + }, + { + "auxiliary_loss_clip": 0.01166693, + "auxiliary_loss_mlp": 0.01109953, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00047147, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.5553915275583683, + "language_loss": 0.74521732, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76798373, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 2.5160133838653564 + }, + { + "auxiliary_loss_clip": 0.0114973, + "auxiliary_loss_mlp": 0.01109196, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00057244, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.6036303075374483, + "language_loss": 0.77994645, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80253565, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.5894830226898193 + }, + { + "auxiliary_loss_clip": 0.01166688, + "auxiliary_loss_mlp": 0.01110616, + "balance_loss_clip": 1.00203741, + "balance_loss_mlp": 1.00065696, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.0439195068104254, + "language_loss": 0.79661745, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.81939048, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 2.4609434604644775 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.0110886, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00052309, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.2734555479946192, + "language_loss": 0.7871539, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80958462, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.5939886569976807 + }, + { + "auxiliary_loss_clip": 0.01151328, + "auxiliary_loss_mlp": 0.01108677, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00043535, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.5403869052135124, + "language_loss": 0.8201791, + "learning_rate": 1.367095017101569e-06, + "loss": 0.8427791, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 2.5617337226867676 + }, + { + "auxiliary_loss_clip": 0.01150195, + "auxiliary_loss_mlp": 0.0111005, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00056863, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 3.0632134669709394, + "language_loss": 0.66230655, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.68490899, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.7617485523223877 + }, + { + "auxiliary_loss_clip": 0.01151665, + "auxiliary_loss_mlp": 0.01109635, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.00043952, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 2.3384475152059796, + "language_loss": 0.71743107, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.74004412, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 2.530381441116333 + }, + { + "auxiliary_loss_clip": 0.01100294, + "auxiliary_loss_mlp": 0.0110914, + "balance_loss_clip": 1.00148094, + "balance_loss_mlp": 1.00061142, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.8588534526472782, + "language_loss": 0.79641318, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81850755, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.655742645263672 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01110356, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00068343, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.8576209610778334, + "language_loss": 0.7596873, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78213918, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.5933773517608643 + }, + { + "auxiliary_loss_clip": 0.01135143, + "auxiliary_loss_mlp": 0.01110609, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.00074565, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.11636840590314, + "language_loss": 0.78424919, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80670667, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.5745513439178467 + }, + { + "auxiliary_loss_clip": 0.01117671, + "auxiliary_loss_mlp": 0.01108615, + "balance_loss_clip": 1.00170553, + "balance_loss_mlp": 1.00056338, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.2622664223554485, + "language_loss": 0.66278422, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68504709, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 2.9543347358703613 + }, + { + "auxiliary_loss_clip": 0.01150279, + "auxiliary_loss_mlp": 0.0074781, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.00091422, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.7852570575363815, + "language_loss": 0.63522393, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65420479, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.6589274406433105 + }, + { + "auxiliary_loss_clip": 0.01134886, + "auxiliary_loss_mlp": 0.01110595, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00054073, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.660831067908615, + "language_loss": 0.75036716, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77282196, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.5810182094573975 + }, + { + "auxiliary_loss_clip": 0.01088717, + "auxiliary_loss_mlp": 0.01110217, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.00054479, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.0262415510121445, + "language_loss": 0.62187409, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.64386344, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 2.720228672027588 + }, + { + "auxiliary_loss_clip": 0.01136835, + "auxiliary_loss_mlp": 0.01109297, + "balance_loss_clip": 1.00187862, + "balance_loss_mlp": 1.00067401, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.5880210378377753, + "language_loss": 0.74235785, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76481926, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 2.618107795715332 + }, + { + "auxiliary_loss_clip": 0.01166624, + "auxiliary_loss_mlp": 0.01110164, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00077772, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.7516644125907963, + "language_loss": 0.78059965, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80336761, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.5403287410736084 + }, + { + "auxiliary_loss_clip": 0.01134992, + "auxiliary_loss_mlp": 0.0110997, + "balance_loss_clip": 1.00193918, + "balance_loss_mlp": 1.00048804, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.4516165696853012, + "language_loss": 0.72941589, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75186551, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 2.652172565460205 + }, + { + "auxiliary_loss_clip": 0.01134707, + "auxiliary_loss_mlp": 0.01109515, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00050974, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.5216567890363464, + "language_loss": 0.69601822, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71846044, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.6731302738189697 + }, + { + "auxiliary_loss_clip": 0.01151763, + "auxiliary_loss_mlp": 0.00747714, + "balance_loss_clip": 1.00191176, + "balance_loss_mlp": 1.00096869, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.784654228556481, + "language_loss": 0.91627258, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93526733, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 2.5852322578430176 + }, + { + "auxiliary_loss_clip": 0.01133587, + "auxiliary_loss_mlp": 0.01109649, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.00064456, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 2.106621198886147, + "language_loss": 0.7119174, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73434973, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 4.056972026824951 + }, + { + "auxiliary_loss_clip": 0.01150205, + "auxiliary_loss_mlp": 0.00747813, + "balance_loss_clip": 1.0018127, + "balance_loss_mlp": 1.00115013, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 2.005311934829819, + "language_loss": 0.67364144, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69262159, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 2.604097366333008 + }, + { + "auxiliary_loss_clip": 0.01149407, + "auxiliary_loss_mlp": 0.01110346, + "balance_loss_clip": 1.0021044, + "balance_loss_mlp": 1.00057817, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 1.9360336693423086, + "language_loss": 0.81287718, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83547473, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.605394124984741 + }, + { + "auxiliary_loss_clip": 0.01166659, + "auxiliary_loss_mlp": 0.01109944, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00046241, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.346832659215971, + "language_loss": 0.80205607, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82482213, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 2.5226659774780273 + }, + { + "auxiliary_loss_clip": 0.01150176, + "auxiliary_loss_mlp": 0.01109901, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00061011, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.5478730228139759, + "language_loss": 0.75969112, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78229189, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 2.583211660385132 + }, + { + "auxiliary_loss_clip": 0.01083394, + "auxiliary_loss_mlp": 0.0108825, + "balance_loss_clip": 1.00118494, + "balance_loss_mlp": 1.00013018, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7606989238989961, + "language_loss": 0.57701695, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59873343, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 3.3830294609069824 + }, + { + "auxiliary_loss_clip": 0.0113606, + "auxiliary_loss_mlp": 0.01109444, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00063014, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 2.132701494832985, + "language_loss": 0.77681708, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79927218, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 2.761723279953003 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.01110235, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00065756, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.503350590664125, + "language_loss": 0.71759129, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.74035984, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 2.4735677242279053 + }, + { + "auxiliary_loss_clip": 0.01166436, + "auxiliary_loss_mlp": 0.01108659, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00060821, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 2.3168953662801943, + "language_loss": 0.7184822, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74123317, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.5114994049072266 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01109485, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00057578, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 2.412268243875912, + "language_loss": 0.72047508, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.7430824, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 3.960827350616455 + }, + { + "auxiliary_loss_clip": 0.01145633, + "auxiliary_loss_mlp": 0.01087749, + "balance_loss_clip": 1.00116825, + "balance_loss_mlp": 1.00001132, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7598966373327014, + "language_loss": 0.56866211, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.59099591, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 4.519827604293823 + }, + { + "auxiliary_loss_clip": 0.01166523, + "auxiliary_loss_mlp": 0.01109947, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00065577, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.6903796033131726, + "language_loss": 0.63707203, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65983671, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 2.598165988922119 + }, + { + "auxiliary_loss_clip": 0.01103677, + "auxiliary_loss_mlp": 0.01108942, + "balance_loss_clip": 1.00184095, + "balance_loss_mlp": 1.0005089, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.6610420391274634, + "language_loss": 0.79050088, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81262708, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.6829473972320557 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.00747917, + "balance_loss_clip": 1.0018822, + "balance_loss_mlp": 1.00106692, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.796877496356203, + "language_loss": 0.86958551, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.8882525, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 2.598196268081665 + }, + { + "auxiliary_loss_clip": 0.01087067, + "auxiliary_loss_mlp": 0.01110373, + "balance_loss_clip": 1.00165009, + "balance_loss_mlp": 1.00060499, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.5962865984158605, + "language_loss": 0.80322647, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82520086, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.6943562030792236 + }, + { + "auxiliary_loss_clip": 0.01104029, + "auxiliary_loss_mlp": 0.01108769, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00052738, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.7192067882982325, + "language_loss": 0.86829197, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89041996, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 4.1517674922943115 + }, + { + "auxiliary_loss_clip": 0.01166543, + "auxiliary_loss_mlp": 0.01109837, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00045049, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.0646870985072785, + "language_loss": 0.68873692, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.71150076, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.6839120388031006 + }, + { + "auxiliary_loss_clip": 0.01135251, + "auxiliary_loss_mlp": 0.01108441, + "balance_loss_clip": 1.00191033, + "balance_loss_mlp": 1.00038946, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 2.355975912115587, + "language_loss": 0.73781312, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76025009, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.5659170150756836 + }, + { + "auxiliary_loss_clip": 0.01151537, + "auxiliary_loss_mlp": 0.01109352, + "balance_loss_clip": 1.00194347, + "balance_loss_mlp": 1.00044215, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.059487936921775, + "language_loss": 0.67873681, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70134568, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.529345989227295 + }, + { + "auxiliary_loss_clip": 0.01064012, + "auxiliary_loss_mlp": 0.01087762, + "balance_loss_clip": 1.00107574, + "balance_loss_mlp": 1.0000242, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8874162536449319, + "language_loss": 0.57766449, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59918225, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 3.457927703857422 + }, + { + "auxiliary_loss_clip": 0.01134601, + "auxiliary_loss_mlp": 0.01108798, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.0004611, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.4540297691526844, + "language_loss": 0.79742175, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81985575, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 2.9270780086517334 + }, + { + "auxiliary_loss_clip": 0.01133936, + "auxiliary_loss_mlp": 0.01110466, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.000507, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.7458116772204275, + "language_loss": 0.80558568, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82802969, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.6251370906829834 + }, + { + "auxiliary_loss_clip": 0.01134616, + "auxiliary_loss_mlp": 0.01109737, + "balance_loss_clip": 1.00186813, + "balance_loss_mlp": 1.00063694, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 2.0067844393062937, + "language_loss": 0.65064347, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67308694, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.664264678955078 + }, + { + "auxiliary_loss_clip": 0.01149678, + "auxiliary_loss_mlp": 0.01109223, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00069475, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 3.450198587501731, + "language_loss": 0.71515375, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73774278, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 2.567070484161377 + }, + { + "auxiliary_loss_clip": 0.01135591, + "auxiliary_loss_mlp": 0.01110218, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00054598, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.6739465746516693, + "language_loss": 0.71901232, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74147034, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 2.6671013832092285 + }, + { + "auxiliary_loss_clip": 0.01136882, + "auxiliary_loss_mlp": 0.01109218, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00049877, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.301812810672398, + "language_loss": 0.63828111, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.6607421, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.658820629119873 + }, + { + "auxiliary_loss_clip": 0.01116227, + "auxiliary_loss_mlp": 0.01109339, + "balance_loss_clip": 1.00189531, + "balance_loss_mlp": 1.00062001, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.7789092781332223, + "language_loss": 0.71424991, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73650563, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.610275983810425 + }, + { + "auxiliary_loss_clip": 0.01150028, + "auxiliary_loss_mlp": 0.01110937, + "balance_loss_clip": 1.00207198, + "balance_loss_mlp": 1.00069225, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 2.170828802770621, + "language_loss": 0.68518198, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70779163, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 2.654489755630493 + }, + { + "auxiliary_loss_clip": 0.0111567, + "auxiliary_loss_mlp": 0.01109439, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00062513, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.9090519061337576, + "language_loss": 0.71243143, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73468256, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.6468379497528076 + }, + { + "auxiliary_loss_clip": 0.01136477, + "auxiliary_loss_mlp": 0.01110504, + "balance_loss_clip": 1.00197077, + "balance_loss_mlp": 1.00045002, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.8457216217212933, + "language_loss": 0.69886261, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72133237, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.721059799194336 + }, + { + "auxiliary_loss_clip": 0.01087148, + "auxiliary_loss_mlp": 0.01109905, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00061357, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 1.9716182120681387, + "language_loss": 0.7603367, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78230727, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.7183027267456055 + }, + { + "auxiliary_loss_clip": 0.01166554, + "auxiliary_loss_mlp": 0.01109524, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00061464, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.7305292360375772, + "language_loss": 0.84986991, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87263066, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.516026020050049 + }, + { + "auxiliary_loss_clip": 0.0108983, + "auxiliary_loss_mlp": 0.01109122, + "balance_loss_clip": 1.00180328, + "balance_loss_mlp": 1.00059438, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 1.6597899524081874, + "language_loss": 0.64514381, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66713333, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 2.8052215576171875 + }, + { + "auxiliary_loss_clip": 0.01118198, + "auxiliary_loss_mlp": 0.01108752, + "balance_loss_clip": 1.00165892, + "balance_loss_mlp": 1.0005101, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.6374328951544523, + "language_loss": 0.75693679, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77920628, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.6554527282714844 + }, + { + "auxiliary_loss_clip": 0.01120359, + "auxiliary_loss_mlp": 0.01110517, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00046301, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.6520253599286894, + "language_loss": 0.74958384, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77189261, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.690289258956909 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01109929, + "balance_loss_clip": 1.00189054, + "balance_loss_mlp": 1.00063789, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.599471580872382, + "language_loss": 0.7557832, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77821493, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 2.614746332168579 + }, + { + "auxiliary_loss_clip": 0.0116639, + "auxiliary_loss_mlp": 0.01109323, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00060415, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.0860462491051663, + "language_loss": 0.7609427, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78369987, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.5015103816986084 + }, + { + "auxiliary_loss_clip": 0.01134564, + "auxiliary_loss_mlp": 0.01109281, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00056171, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.726469145502802, + "language_loss": 0.82134199, + "learning_rate": 1.347916569325736e-06, + "loss": 0.8437804, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 4.321917295455933 + }, + { + "auxiliary_loss_clip": 0.01166703, + "auxiliary_loss_mlp": 0.00747862, + "balance_loss_clip": 1.00205767, + "balance_loss_mlp": 1.00099826, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.5425207640052323, + "language_loss": 0.77029693, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.78944254, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 2.5763099193573 + }, + { + "auxiliary_loss_clip": 0.01115, + "auxiliary_loss_mlp": 0.01087057, + "balance_loss_clip": 1.00137687, + "balance_loss_mlp": 1.00008225, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8080931602053846, + "language_loss": 0.59084338, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61286396, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.126324415206909 + }, + { + "auxiliary_loss_clip": 0.01136549, + "auxiliary_loss_mlp": 0.01109474, + "balance_loss_clip": 1.0018872, + "balance_loss_mlp": 1.00056446, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.466471020600483, + "language_loss": 0.7261399, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.74860013, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 2.615783452987671 + }, + { + "auxiliary_loss_clip": 0.01151367, + "auxiliary_loss_mlp": 0.00747836, + "balance_loss_clip": 1.00199544, + "balance_loss_mlp": 1.00094366, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 2.092782352946205, + "language_loss": 0.77349782, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79248977, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.580836534500122 + }, + { + "auxiliary_loss_clip": 0.01116369, + "auxiliary_loss_mlp": 0.01109277, + "balance_loss_clip": 1.00159264, + "balance_loss_mlp": 1.0005585, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.6888703295810914, + "language_loss": 0.7969982, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81925464, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 2.666872024536133 + }, + { + "auxiliary_loss_clip": 0.01088531, + "auxiliary_loss_mlp": 0.01109355, + "balance_loss_clip": 1.00178397, + "balance_loss_mlp": 1.00054073, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 1.8267536692694224, + "language_loss": 0.80595732, + "learning_rate": 1.345707936733612e-06, + "loss": 0.82793617, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 2.6950860023498535 + }, + { + "auxiliary_loss_clip": 0.01120384, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00052238, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.7942357260338448, + "language_loss": 0.81931651, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.84162229, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.65635347366333 + }, + { + "auxiliary_loss_clip": 0.01100789, + "auxiliary_loss_mlp": 0.00747685, + "balance_loss_clip": 1.00163651, + "balance_loss_mlp": 1.00084305, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.8584628589496426, + "language_loss": 0.7379958, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75648057, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 2.751683473587036 + }, + { + "auxiliary_loss_clip": 0.01151607, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00043011, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.5773363896577408, + "language_loss": 0.70784897, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.73045945, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 4.092168807983398 + }, + { + "auxiliary_loss_clip": 0.0116656, + "auxiliary_loss_mlp": 0.01109492, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.0005827, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.5280903132731913, + "language_loss": 0.72861576, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75137627, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 4.039320468902588 + }, + { + "auxiliary_loss_clip": 0.01135002, + "auxiliary_loss_mlp": 0.01107318, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.00069714, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.457227943348463, + "language_loss": 0.77133507, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.79375827, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 2.624917984008789 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01110539, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00048447, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.802206481530343, + "language_loss": 0.68917578, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71162999, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.6281065940856934 + }, + { + "auxiliary_loss_clip": 0.01151826, + "auxiliary_loss_mlp": 0.01110043, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00046587, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.585579713028422, + "language_loss": 0.74940342, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77202213, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 2.5535309314727783 + }, + { + "auxiliary_loss_clip": 0.01149579, + "auxiliary_loss_mlp": 0.01108957, + "balance_loss_clip": 1.00199604, + "balance_loss_mlp": 1.00061965, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.528643925966495, + "language_loss": 0.75328338, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77586877, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.5754692554473877 + }, + { + "auxiliary_loss_clip": 0.01117976, + "auxiliary_loss_mlp": 0.0110978, + "balance_loss_clip": 1.00171351, + "balance_loss_mlp": 1.00058413, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.654422107027076, + "language_loss": 0.72808635, + "learning_rate": 1.342396663517503e-06, + "loss": 0.75036395, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 4.004188776016235 + }, + { + "auxiliary_loss_clip": 0.01166479, + "auxiliary_loss_mlp": 0.01109415, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00050592, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.6158344458905973, + "language_loss": 0.75882405, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78158295, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.505789279937744 + }, + { + "auxiliary_loss_clip": 0.01118726, + "auxiliary_loss_mlp": 0.01109159, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00072575, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 2.779511749692727, + "language_loss": 0.73081732, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75309622, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 2.635591745376587 + }, + { + "auxiliary_loss_clip": 0.0115151, + "auxiliary_loss_mlp": 0.01108689, + "balance_loss_clip": 1.00197852, + "balance_loss_mlp": 1.00054252, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4916505300662706, + "language_loss": 0.7317068, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.75430882, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.7394769191741943 + }, + { + "auxiliary_loss_clip": 0.0113521, + "auxiliary_loss_mlp": 0.01109303, + "balance_loss_clip": 1.00189865, + "balance_loss_mlp": 1.00067985, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.5342593327512768, + "language_loss": 0.79439521, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81684029, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.6102869510650635 + }, + { + "auxiliary_loss_clip": 0.01149368, + "auxiliary_loss_mlp": 0.01110216, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00054348, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6183114040513875, + "language_loss": 0.81304151, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83563733, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.5869266986846924 + }, + { + "auxiliary_loss_clip": 0.01166561, + "auxiliary_loss_mlp": 0.01109025, + "balance_loss_clip": 1.00197935, + "balance_loss_mlp": 1.00049663, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.5612281100316405, + "language_loss": 0.77425027, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79700613, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 2.544081926345825 + }, + { + "auxiliary_loss_clip": 0.01137374, + "auxiliary_loss_mlp": 0.01110612, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00065315, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.5866147717762327, + "language_loss": 0.73027724, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75275707, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 2.6117513179779053 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.00747654, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00092053, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.5745411245694578, + "language_loss": 0.82863384, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.84728765, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 2.665114402770996 + }, + { + "auxiliary_loss_clip": 0.01134917, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_clip": 1.00184929, + "balance_loss_mlp": 1.0005759, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 1.923926737345416, + "language_loss": 0.7078225, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.73026085, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.5590920448303223 + }, + { + "auxiliary_loss_clip": 0.01166667, + "auxiliary_loss_mlp": 0.01109203, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.00077045, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.5241064876798585, + "language_loss": 0.70211154, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.7248702, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 2.6400513648986816 + }, + { + "auxiliary_loss_clip": 0.01118178, + "auxiliary_loss_mlp": 0.01109808, + "balance_loss_clip": 1.00184214, + "balance_loss_mlp": 1.00061226, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 1.73510187944164, + "language_loss": 0.71450955, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73678941, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.672574996948242 + }, + { + "auxiliary_loss_clip": 0.01162266, + "auxiliary_loss_mlp": 0.01087745, + "balance_loss_clip": 1.00130749, + "balance_loss_mlp": 1.00000691, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8811071541621093, + "language_loss": 0.64118421, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66368425, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 2.9951677322387695 + }, + { + "auxiliary_loss_clip": 0.01166602, + "auxiliary_loss_mlp": 0.0110925, + "balance_loss_clip": 1.00208271, + "balance_loss_mlp": 1.00062633, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 2.0992283056477206, + "language_loss": 0.74382091, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76657945, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.5092506408691406 + }, + { + "auxiliary_loss_clip": 0.01149926, + "auxiliary_loss_mlp": 0.01110598, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00054431, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.5499955724678778, + "language_loss": 0.68035853, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70296371, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.507505178451538 + }, + { + "auxiliary_loss_clip": 0.01150041, + "auxiliary_loss_mlp": 0.00747706, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.0009526, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 3.026014718189143, + "language_loss": 0.66614914, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.6851266, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 2.5007638931274414 + }, + { + "auxiliary_loss_clip": 0.01116561, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_clip": 1.00163484, + "balance_loss_mlp": 1.00054657, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.558837698487701, + "language_loss": 0.72947258, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.75173181, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.6870038509368896 + }, + { + "auxiliary_loss_clip": 0.01133553, + "auxiliary_loss_mlp": 0.01109475, + "balance_loss_clip": 1.00186229, + "balance_loss_mlp": 1.00037479, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 15.855248998162727, + "language_loss": 0.80086178, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82329208, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.545424461364746 + }, + { + "auxiliary_loss_clip": 0.01166706, + "auxiliary_loss_mlp": 0.01109971, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.00048923, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.5605493136001136, + "language_loss": 0.76556599, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78833282, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.492105722427368 + }, + { + "auxiliary_loss_clip": 0.01117593, + "auxiliary_loss_mlp": 0.01110469, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00051069, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 7.844512631317684, + "language_loss": 0.76781094, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79009163, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 2.637603759765625 + }, + { + "auxiliary_loss_clip": 0.01151912, + "auxiliary_loss_mlp": 0.01109885, + "balance_loss_clip": 1.00205088, + "balance_loss_mlp": 1.00049853, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6254170448766867, + "language_loss": 0.7879746, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81059253, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01102358, + "auxiliary_loss_mlp": 0.01108077, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00040674, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.5462219785011406, + "language_loss": 0.79969013, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82179451, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.682875633239746 + }, + { + "auxiliary_loss_clip": 0.01113941, + "auxiliary_loss_mlp": 0.01087798, + "balance_loss_clip": 1.00129986, + "balance_loss_mlp": 1.00005996, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8089849248003365, + "language_loss": 0.59370017, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61571753, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 4.626006841659546 + }, + { + "auxiliary_loss_clip": 0.01132932, + "auxiliary_loss_mlp": 0.01108671, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00052428, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.6520529973670772, + "language_loss": 0.68160307, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70401907, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 2.6672844886779785 + }, + { + "auxiliary_loss_clip": 0.01135181, + "auxiliary_loss_mlp": 0.01108707, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.00075126, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.4967312317591719, + "language_loss": 0.72124034, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74367917, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 2.5843584537506104 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.01109006, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00057352, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 2.2206896127509603, + "language_loss": 0.78778267, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81021059, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.5847995281219482 + }, + { + "auxiliary_loss_clip": 0.01119096, + "auxiliary_loss_mlp": 0.01109621, + "balance_loss_clip": 1.00158799, + "balance_loss_mlp": 1.00052047, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.698523706839704, + "language_loss": 0.72968984, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75197703, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.592289686203003 + }, + { + "auxiliary_loss_clip": 0.01100226, + "auxiliary_loss_mlp": 0.01109719, + "balance_loss_clip": 1.0016458, + "balance_loss_mlp": 1.00052333, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 1.7843045438659075, + "language_loss": 0.7221148, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.7442143, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.6642415523529053 + }, + { + "auxiliary_loss_clip": 0.01151957, + "auxiliary_loss_mlp": 0.01110078, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00050104, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8492052260477498, + "language_loss": 0.77942479, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80204517, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 2.517768144607544 + }, + { + "auxiliary_loss_clip": 0.01149747, + "auxiliary_loss_mlp": 0.01109362, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00054765, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 2.0333329590118985, + "language_loss": 0.78359294, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80618405, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 2.5237350463867188 + }, + { + "auxiliary_loss_clip": 0.01116714, + "auxiliary_loss_mlp": 0.01110089, + "balance_loss_clip": 1.00182188, + "balance_loss_mlp": 1.00060761, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 1.6447345799487536, + "language_loss": 0.76005441, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78232247, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.629263162612915 + }, + { + "auxiliary_loss_clip": 0.01166505, + "auxiliary_loss_mlp": 0.01109086, + "balance_loss_clip": 1.00188386, + "balance_loss_mlp": 1.00055826, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 1.8878504795764666, + "language_loss": 0.77273476, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 2.5656015872955322 + }, + { + "auxiliary_loss_clip": 0.01131423, + "auxiliary_loss_mlp": 0.01087393, + "balance_loss_clip": 1.00133002, + "balance_loss_mlp": 1.00003672, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6869906562668321, + "language_loss": 0.5906539, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61284208, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 4.603134393692017 + }, + { + "auxiliary_loss_clip": 0.01135333, + "auxiliary_loss_mlp": 0.01110367, + "balance_loss_clip": 1.00206888, + "balance_loss_mlp": 1.00069499, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.9368737559165972, + "language_loss": 0.78143895, + "learning_rate": 1.330272686582143e-06, + "loss": 0.80389595, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 4.063633680343628 + }, + { + "auxiliary_loss_clip": 0.01134049, + "auxiliary_loss_mlp": 0.01108895, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00065315, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 1.688434444879073, + "language_loss": 0.66309798, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68552744, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 2.586137533187866 + }, + { + "auxiliary_loss_clip": 0.0111782, + "auxiliary_loss_mlp": 0.01107948, + "balance_loss_clip": 1.00172639, + "balance_loss_mlp": 1.00046873, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.9554237372952328, + "language_loss": 0.76136428, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78362191, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 2.5864930152893066 + }, + { + "auxiliary_loss_clip": 0.01132996, + "auxiliary_loss_mlp": 0.01108983, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00055075, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.9360983065296893, + "language_loss": 0.73342073, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75584054, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 2.6079349517822266 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01108605, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00055349, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.9633115355690336, + "language_loss": 0.72729301, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.74958169, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.644702911376953 + }, + { + "auxiliary_loss_clip": 0.01151389, + "auxiliary_loss_mlp": 0.011099, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00051308, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 3.4972209861534655, + "language_loss": 0.58834618, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.61095905, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 4.004282236099243 + }, + { + "auxiliary_loss_clip": 0.0111801, + "auxiliary_loss_mlp": 0.01109793, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.000597, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 1.8160830464123046, + "language_loss": 0.76687562, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78915364, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.597996234893799 + }, + { + "auxiliary_loss_clip": 0.01149869, + "auxiliary_loss_mlp": 0.01109842, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00035977, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.7731788937614037, + "language_loss": 0.72318196, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74577904, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.552851438522339 + }, + { + "auxiliary_loss_clip": 0.01149892, + "auxiliary_loss_mlp": 0.01110118, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00063586, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.3532873862910453, + "language_loss": 0.74319458, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.76579463, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 2.5471956729888916 + }, + { + "auxiliary_loss_clip": 0.01133368, + "auxiliary_loss_mlp": 0.01110574, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00052011, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.398631620490248, + "language_loss": 0.79712141, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81956077, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.536701202392578 + }, + { + "auxiliary_loss_clip": 0.01118007, + "auxiliary_loss_mlp": 0.01109083, + "balance_loss_clip": 1.00169098, + "balance_loss_mlp": 1.00065041, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.7000233208973645, + "language_loss": 0.77906132, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.80133218, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 2.6285672187805176 + }, + { + "auxiliary_loss_clip": 0.01145142, + "auxiliary_loss_mlp": 0.01087473, + "balance_loss_clip": 1.00120234, + "balance_loss_mlp": 1.00011683, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8369057097144273, + "language_loss": 0.62174207, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64406824, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.0650715827941895 + }, + { + "auxiliary_loss_clip": 0.01151386, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.0006969, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.163257410988904, + "language_loss": 0.77311873, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79573727, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 2.571692705154419 + }, + { + "auxiliary_loss_clip": 0.01166646, + "auxiliary_loss_mlp": 0.0110989, + "balance_loss_clip": 1.00200593, + "balance_loss_mlp": 1.00050354, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 1.8023019797589206, + "language_loss": 0.67940819, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.70217359, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 2.4892382621765137 + }, + { + "auxiliary_loss_clip": 0.0111766, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_clip": 1.0017128, + "balance_loss_mlp": 1.00056231, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 6.367739561902908, + "language_loss": 0.76426572, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78653705, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 2.620600700378418 + }, + { + "auxiliary_loss_clip": 0.01133137, + "auxiliary_loss_mlp": 0.01109575, + "balance_loss_clip": 1.00160241, + "balance_loss_mlp": 1.00056958, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.1529924950079566, + "language_loss": 0.6917181, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71414518, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.577488660812378 + }, + { + "auxiliary_loss_clip": 0.01132978, + "auxiliary_loss_mlp": 0.00747721, + "balance_loss_clip": 1.00181901, + "balance_loss_mlp": 1.00098944, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 2.2627463103817513, + "language_loss": 0.69711757, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.7159245, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.6790528297424316 + }, + { + "auxiliary_loss_clip": 0.01104885, + "auxiliary_loss_mlp": 0.01109367, + "balance_loss_clip": 1.00169551, + "balance_loss_mlp": 1.00045753, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.8276534291873472, + "language_loss": 0.79812968, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82027215, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 2.688739061355591 + }, + { + "auxiliary_loss_clip": 0.01166372, + "auxiliary_loss_mlp": 0.01108564, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00051296, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.843383416760665, + "language_loss": 0.73569804, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75844741, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 2.5133249759674072 + }, + { + "auxiliary_loss_clip": 0.01166526, + "auxiliary_loss_mlp": 0.01110744, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00068998, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 1.942015491845541, + "language_loss": 0.63230288, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65507555, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.5478267669677734 + }, + { + "auxiliary_loss_clip": 0.01149831, + "auxiliary_loss_mlp": 0.01110145, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00056779, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.6699875181756818, + "language_loss": 0.71967351, + "learning_rate": 1.322938249724991e-06, + "loss": 0.74227333, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.5398995876312256 + }, + { + "auxiliary_loss_clip": 0.01102515, + "auxiliary_loss_mlp": 0.01108816, + "balance_loss_clip": 1.00171816, + "balance_loss_mlp": 1.0004791, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.6305040040715668, + "language_loss": 0.69379318, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.7159065, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 2.649139165878296 + }, + { + "auxiliary_loss_clip": 0.01118582, + "auxiliary_loss_mlp": 0.01108447, + "balance_loss_clip": 1.00164974, + "balance_loss_mlp": 1.00039625, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 1.9621985137904419, + "language_loss": 0.6867072, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70897746, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 2.6162660121917725 + }, + { + "auxiliary_loss_clip": 0.01149791, + "auxiliary_loss_mlp": 0.01110029, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00045228, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 2.195127794937604, + "language_loss": 0.80720448, + "learning_rate": 1.321838967240299e-06, + "loss": 0.82980263, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.5167620182037354 + }, + { + "auxiliary_loss_clip": 0.01133303, + "auxiliary_loss_mlp": 0.0108739, + "balance_loss_clip": 1.00141203, + "balance_loss_mlp": 1.00003362, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.778397319128724, + "language_loss": 0.57310957, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59531653, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 3.0817060470581055 + }, + { + "auxiliary_loss_clip": 0.01119979, + "auxiliary_loss_mlp": 0.01108398, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00053716, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.7982184240055685, + "language_loss": 0.7314682, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75375199, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.688784122467041 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01109521, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00061166, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.7901930421196124, + "language_loss": 0.59877378, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62136614, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.5802245140075684 + }, + { + "auxiliary_loss_clip": 0.01089148, + "auxiliary_loss_mlp": 0.01108258, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.00039721, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 5.282255881647408, + "language_loss": 0.77990365, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80187774, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 4.085718154907227 + }, + { + "auxiliary_loss_clip": 0.01118489, + "auxiliary_loss_mlp": 0.01109568, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.00046778, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.5887341456684478, + "language_loss": 0.71432114, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73660171, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 2.6800572872161865 + }, + { + "auxiliary_loss_clip": 0.0114912, + "auxiliary_loss_mlp": 0.01109018, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00048947, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.7740417912708704, + "language_loss": 0.7175824, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.7401638, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 2.5692214965820312 + }, + { + "auxiliary_loss_clip": 0.01114689, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_clip": 1.00149059, + "balance_loss_mlp": 1.00001383, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8134358323983839, + "language_loss": 0.54153162, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.5635522, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 3.1883270740509033 + }, + { + "auxiliary_loss_clip": 0.01120018, + "auxiliary_loss_mlp": 0.01110042, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00036991, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.026391350992175, + "language_loss": 0.69666433, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71896493, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.616481065750122 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01109651, + "balance_loss_clip": 1.0020591, + "balance_loss_mlp": 1.00064611, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.8198818963856744, + "language_loss": 0.56880867, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59157062, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.509077548980713 + }, + { + "auxiliary_loss_clip": 0.01145069, + "auxiliary_loss_mlp": 0.01087382, + "balance_loss_clip": 1.00129628, + "balance_loss_mlp": 1.00002575, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8154543922945646, + "language_loss": 0.61162984, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63395435, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 3.052487850189209 + }, + { + "auxiliary_loss_clip": 0.01166446, + "auxiliary_loss_mlp": 0.01108616, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00046921, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.940792112245384, + "language_loss": 0.81797725, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84072781, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 2.5138933658599854 + }, + { + "auxiliary_loss_clip": 0.0114966, + "auxiliary_loss_mlp": 0.01108814, + "balance_loss_clip": 1.00198483, + "balance_loss_mlp": 1.00066745, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.466068527861143, + "language_loss": 0.75593603, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.7785207, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.5855460166931152 + }, + { + "auxiliary_loss_clip": 0.01117651, + "auxiliary_loss_mlp": 0.01109265, + "balance_loss_clip": 1.00197887, + "balance_loss_mlp": 1.00054634, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.4230274527742386, + "language_loss": 0.7847085, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80697769, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 4.003844261169434 + }, + { + "auxiliary_loss_clip": 0.01150139, + "auxiliary_loss_mlp": 0.01108881, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.0006392, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.504641482917378, + "language_loss": 0.77989554, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80248582, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.572253465652466 + }, + { + "auxiliary_loss_clip": 0.01134088, + "auxiliary_loss_mlp": 0.00747803, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00093043, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 1.793966643436555, + "language_loss": 0.67527032, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69408923, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 4.019026041030884 + }, + { + "auxiliary_loss_clip": 0.01135171, + "auxiliary_loss_mlp": 0.01109689, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00049281, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.4686206370325854, + "language_loss": 0.75852239, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78097093, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.58978533744812 + }, + { + "auxiliary_loss_clip": 0.01133177, + "auxiliary_loss_mlp": 0.01109802, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00051093, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 1.9458105290201626, + "language_loss": 0.81956989, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84199965, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.55350399017334 + }, + { + "auxiliary_loss_clip": 0.01134077, + "auxiliary_loss_mlp": 0.01108971, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00063396, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 3.3563169676700384, + "language_loss": 0.7394371, + "learning_rate": 1.315248145768822e-06, + "loss": 0.76186752, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 2.54974627494812 + }, + { + "auxiliary_loss_clip": 0.01151188, + "auxiliary_loss_mlp": 0.01109264, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00054502, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 1.9295543983445693, + "language_loss": 0.77169788, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.7943024, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.5059869289398193 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01108826, + "balance_loss_clip": 1.00174105, + "balance_loss_mlp": 1.00067925, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.5674790544266248, + "language_loss": 0.67813671, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.70040482, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 3.971158742904663 + }, + { + "auxiliary_loss_clip": 0.01136349, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_clip": 1.00182235, + "balance_loss_mlp": 1.00056696, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 1.8598466023714555, + "language_loss": 0.67950672, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.70196402, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.6300852298736572 + }, + { + "auxiliary_loss_clip": 0.01101027, + "auxiliary_loss_mlp": 0.01109312, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.00049782, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.8486142813941844, + "language_loss": 0.86541688, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88752031, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.623694658279419 + }, + { + "auxiliary_loss_clip": 0.0112892, + "auxiliary_loss_mlp": 0.01087375, + "balance_loss_clip": 1.00123811, + "balance_loss_mlp": 1.000018, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8991620831170264, + "language_loss": 0.60831475, + "learning_rate": 1.313418851605015e-06, + "loss": 0.63047767, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.1504738330841064 + }, + { + "auxiliary_loss_clip": 0.01117074, + "auxiliary_loss_mlp": 0.00747825, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00089395, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 1.8504705723607984, + "language_loss": 0.75363004, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77227902, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.627657651901245 + }, + { + "auxiliary_loss_clip": 0.01149987, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_clip": 1.00199509, + "balance_loss_mlp": 1.0006721, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 2.075972551627585, + "language_loss": 0.76398665, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78658706, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 2.558459758758545 + }, + { + "auxiliary_loss_clip": 0.01151607, + "auxiliary_loss_mlp": 0.01109158, + "balance_loss_clip": 1.00205004, + "balance_loss_mlp": 1.00062966, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.4823494883133213, + "language_loss": 0.78480196, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80740958, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.6040968894958496 + }, + { + "auxiliary_loss_clip": 0.01087236, + "auxiliary_loss_mlp": 0.01109432, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00052214, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.8498615424536207, + "language_loss": 0.68268704, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70465374, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.7247202396392822 + }, + { + "auxiliary_loss_clip": 0.01166478, + "auxiliary_loss_mlp": 0.01110164, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00058651, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.034243924724169, + "language_loss": 0.87766957, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90043598, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 2.4574387073516846 + }, + { + "auxiliary_loss_clip": 0.01166437, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00053847, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.4965621720974605, + "language_loss": 0.66175741, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68451244, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 2.611809015274048 + }, + { + "auxiliary_loss_clip": 0.01149594, + "auxiliary_loss_mlp": 0.01108441, + "balance_loss_clip": 1.00193274, + "balance_loss_mlp": 1.00058031, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.3412442705714644, + "language_loss": 0.77911806, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.80169845, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 2.656712532043457 + }, + { + "auxiliary_loss_clip": 0.01150061, + "auxiliary_loss_mlp": 0.01109637, + "balance_loss_clip": 1.00186956, + "balance_loss_mlp": 1.00044096, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.5007757970372129, + "language_loss": 0.77409106, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79668808, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.591703414916992 + }, + { + "auxiliary_loss_clip": 0.01149195, + "auxiliary_loss_mlp": 0.01108764, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00052178, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.5467786010534583, + "language_loss": 0.69726771, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71984732, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.574096918106079 + }, + { + "auxiliary_loss_clip": 0.01137017, + "auxiliary_loss_mlp": 0.01109042, + "balance_loss_clip": 1.00198162, + "balance_loss_mlp": 1.00051379, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.6655858368115775, + "language_loss": 0.77150381, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79396439, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 2.57651686668396 + }, + { + "auxiliary_loss_clip": 0.01133203, + "auxiliary_loss_mlp": 0.01108455, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00059414, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.4534243134369145, + "language_loss": 0.70088953, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72330606, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.701789617538452 + }, + { + "auxiliary_loss_clip": 0.01119176, + "auxiliary_loss_mlp": 0.01109857, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.0006609, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.5972595593469159, + "language_loss": 0.76447272, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78676307, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 2.6514906883239746 + }, + { + "auxiliary_loss_clip": 0.01133101, + "auxiliary_loss_mlp": 0.01108982, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00045383, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 1.5723545586605976, + "language_loss": 0.67967498, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70209575, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 2.59222412109375 + }, + { + "auxiliary_loss_clip": 0.01135102, + "auxiliary_loss_mlp": 0.01109307, + "balance_loss_clip": 1.00168681, + "balance_loss_mlp": 1.00049233, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.7811701633975916, + "language_loss": 0.76629031, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78873444, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.602912187576294 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01109429, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00042439, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.4423234446836122, + "language_loss": 0.79494792, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.8173821, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.6806657314300537 + }, + { + "auxiliary_loss_clip": 0.01150008, + "auxiliary_loss_mlp": 0.01108244, + "balance_loss_clip": 1.0019151, + "balance_loss_mlp": 1.00066948, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.4714013526801564, + "language_loss": 0.79799849, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82058096, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 2.539910316467285 + }, + { + "auxiliary_loss_clip": 0.01134702, + "auxiliary_loss_mlp": 0.0110967, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00056946, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 1.998158646932322, + "language_loss": 0.75122666, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.77367038, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 2.5527985095977783 + }, + { + "auxiliary_loss_clip": 0.01149556, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00048161, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.3286226612168712, + "language_loss": 0.78504431, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80762619, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.5863118171691895 + }, + { + "auxiliary_loss_clip": 0.01121922, + "auxiliary_loss_mlp": 0.01108121, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00045109, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 2.0183649641643404, + "language_loss": 0.75051999, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77282047, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 4.047317743301392 + }, + { + "auxiliary_loss_clip": 0.01136209, + "auxiliary_loss_mlp": 0.01110334, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00047088, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 1.678883178088497, + "language_loss": 0.66461509, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.6870805, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.592794895172119 + }, + { + "auxiliary_loss_clip": 0.01132606, + "auxiliary_loss_mlp": 0.01086971, + "balance_loss_clip": 1.0012486, + "balance_loss_mlp": 0.99999619, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7525698926896697, + "language_loss": 0.61999011, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64218593, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.1877660751342773 + }, + { + "auxiliary_loss_clip": 0.01149662, + "auxiliary_loss_mlp": 0.01109147, + "balance_loss_clip": 1.00187266, + "balance_loss_mlp": 1.00042796, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 3.594190757564105, + "language_loss": 0.71977758, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74236572, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 2.5573506355285645 + }, + { + "auxiliary_loss_clip": 0.01150209, + "auxiliary_loss_mlp": 0.01110778, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00062883, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.4344654104953753, + "language_loss": 0.65362424, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67623413, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.6007025241851807 + }, + { + "auxiliary_loss_clip": 0.01115848, + "auxiliary_loss_mlp": 0.01108962, + "balance_loss_clip": 1.00166821, + "balance_loss_mlp": 1.00043416, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.6893901809577723, + "language_loss": 0.79297954, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81522763, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 2.58537220954895 + }, + { + "auxiliary_loss_clip": 0.01134725, + "auxiliary_loss_mlp": 0.01109215, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00040054, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.7074309751910155, + "language_loss": 0.60519338, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62763286, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 2.6046175956726074 + }, + { + "auxiliary_loss_clip": 0.01135133, + "auxiliary_loss_mlp": 0.01109483, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.00047755, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.885420080151179, + "language_loss": 0.77449846, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79694468, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.6217663288116455 + }, + { + "auxiliary_loss_clip": 0.01133117, + "auxiliary_loss_mlp": 0.01109647, + "balance_loss_clip": 1.00187719, + "balance_loss_mlp": 1.00045156, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.516682063683264, + "language_loss": 0.64576173, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66818941, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 2.7722933292388916 + }, + { + "auxiliary_loss_clip": 0.01133714, + "auxiliary_loss_mlp": 0.01109729, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00062835, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.8826457596391053, + "language_loss": 0.7700581, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.79249251, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.60298490524292 + }, + { + "auxiliary_loss_clip": 0.01117563, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00081038, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.8800608389300415, + "language_loss": 0.82375056, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84240299, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 4.011220455169678 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01109631, + "balance_loss_clip": 1.0018568, + "balance_loss_mlp": 1.00062585, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.6202730760632833, + "language_loss": 0.75039941, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77284157, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 2.546644687652588 + }, + { + "auxiliary_loss_clip": 0.01136274, + "auxiliary_loss_mlp": 0.01110448, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00068009, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 4.041794386559212, + "language_loss": 0.72591525, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74838245, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 4.014496564865112 + }, + { + "auxiliary_loss_clip": 0.01115903, + "auxiliary_loss_mlp": 0.01108949, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.0005163, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.6079410615730598, + "language_loss": 0.7591809, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.78142941, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 2.6416962146759033 + }, + { + "auxiliary_loss_clip": 0.01132987, + "auxiliary_loss_mlp": 0.01108718, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00057173, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 1.845295487348025, + "language_loss": 0.7500267, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77244377, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 2.6263749599456787 + }, + { + "auxiliary_loss_clip": 0.01166389, + "auxiliary_loss_mlp": 0.01109008, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.00048018, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.7570806515031825, + "language_loss": 0.74307024, + "learning_rate": 1.300997001489483e-06, + "loss": 0.7658242, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.5552546977996826 + }, + { + "auxiliary_loss_clip": 0.01118622, + "auxiliary_loss_mlp": 0.01109619, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.0006144, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.9509507321340913, + "language_loss": 0.74450839, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76679087, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 4.0617899894714355 + }, + { + "auxiliary_loss_clip": 0.01117734, + "auxiliary_loss_mlp": 0.01087724, + "balance_loss_clip": 1.0012393, + "balance_loss_mlp": 0.99998611, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8267011020378726, + "language_loss": 0.56445932, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58651394, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.256363868713379 + }, + { + "auxiliary_loss_clip": 0.01150832, + "auxiliary_loss_mlp": 0.01109479, + "balance_loss_clip": 1.00188231, + "balance_loss_mlp": 1.00047386, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.2656069142627673, + "language_loss": 0.82757378, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85017693, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 2.53501033782959 + }, + { + "auxiliary_loss_clip": 0.01065012, + "auxiliary_loss_mlp": 0.01108998, + "balance_loss_clip": 1.00164449, + "balance_loss_mlp": 1.00056529, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 2.3024470933193233, + "language_loss": 0.69527316, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71701324, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 2.7939629554748535 + }, + { + "auxiliary_loss_clip": 0.01119613, + "auxiliary_loss_mlp": 0.01109053, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00052452, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.7052815635725207, + "language_loss": 0.71787012, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.74015677, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 2.664508104324341 + }, + { + "auxiliary_loss_clip": 0.01102888, + "auxiliary_loss_mlp": 0.01110019, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.0005374, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 1.7472712410745257, + "language_loss": 0.69729275, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71942186, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.668473243713379 + }, + { + "auxiliary_loss_clip": 0.01135201, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_clip": 1.0019666, + "balance_loss_mlp": 1.00064242, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.6081265203263868, + "language_loss": 0.79071808, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81316751, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.5925323963165283 + }, + { + "auxiliary_loss_clip": 0.01117241, + "auxiliary_loss_mlp": 0.01108467, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00051093, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.69190808196278, + "language_loss": 0.68913162, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71138871, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 2.68851375579834 + }, + { + "auxiliary_loss_clip": 0.01151401, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.000808, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.629344630525258, + "language_loss": 0.85571253, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87470245, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.6020989418029785 + }, + { + "auxiliary_loss_clip": 0.01134272, + "auxiliary_loss_mlp": 0.00747592, + "balance_loss_clip": 1.00181901, + "balance_loss_mlp": 1.00077391, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.809382158607361, + "language_loss": 0.79968381, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81850243, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 2.6000759601593018 + }, + { + "auxiliary_loss_clip": 0.01134663, + "auxiliary_loss_mlp": 0.01108741, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00049949, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.067823610183919, + "language_loss": 0.69605619, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71849024, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.584273338317871 + }, + { + "auxiliary_loss_clip": 0.01104699, + "auxiliary_loss_mlp": 0.01107685, + "balance_loss_clip": 1.00184691, + "balance_loss_mlp": 1.0003016, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.6940595541969696, + "language_loss": 0.67483371, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69695753, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 2.683784246444702 + }, + { + "auxiliary_loss_clip": 0.01101283, + "auxiliary_loss_mlp": 0.01109906, + "balance_loss_clip": 1.00150585, + "balance_loss_mlp": 1.0006144, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.5446960786331567, + "language_loss": 0.69719112, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71930301, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 2.705108165740967 + }, + { + "auxiliary_loss_clip": 0.01134845, + "auxiliary_loss_mlp": 0.01109549, + "balance_loss_clip": 1.00170457, + "balance_loss_mlp": 1.00054359, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.5222243953096972, + "language_loss": 0.69310725, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.7155512, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.6273982524871826 + }, + { + "auxiliary_loss_clip": 0.01118178, + "auxiliary_loss_mlp": 0.01110132, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00045967, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 3.847452339079232, + "language_loss": 0.80911309, + "learning_rate": 1.295526482316796e-06, + "loss": 0.83139622, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 2.588932514190674 + }, + { + "auxiliary_loss_clip": 0.0114979, + "auxiliary_loss_mlp": 0.01109899, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.00060773, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.704572098427475, + "language_loss": 0.74751043, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.77010739, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.548053026199341 + }, + { + "auxiliary_loss_clip": 0.01101518, + "auxiliary_loss_mlp": 0.0110957, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00046992, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.7855748832736384, + "language_loss": 0.73944134, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76155221, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.6812219619750977 + }, + { + "auxiliary_loss_clip": 0.01133109, + "auxiliary_loss_mlp": 0.01107893, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.0005089, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.5547309829974685, + "language_loss": 0.84414607, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86655605, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 2.637885332107544 + }, + { + "auxiliary_loss_clip": 0.01151743, + "auxiliary_loss_mlp": 0.01108681, + "balance_loss_clip": 1.00206923, + "balance_loss_mlp": 1.00053453, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 1.9104945922560683, + "language_loss": 0.57279843, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59540266, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.5322818756103516 + }, + { + "auxiliary_loss_clip": 0.01151714, + "auxiliary_loss_mlp": 0.01109729, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00062823, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.810976275834956, + "language_loss": 0.8465597, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.86917412, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.540519952774048 + }, + { + "auxiliary_loss_clip": 0.0116659, + "auxiliary_loss_mlp": 0.01109763, + "balance_loss_clip": 1.00216997, + "balance_loss_mlp": 1.00056696, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.3912207374873427, + "language_loss": 0.64650637, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66926992, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 2.5411536693573 + }, + { + "auxiliary_loss_clip": 0.01118414, + "auxiliary_loss_mlp": 0.01110353, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00067997, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 2.3413605590755657, + "language_loss": 0.85705334, + "learning_rate": 1.292975627485741e-06, + "loss": 0.87934101, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 2.6080312728881836 + }, + { + "auxiliary_loss_clip": 0.01119822, + "auxiliary_loss_mlp": 0.01109055, + "balance_loss_clip": 1.00192821, + "balance_loss_mlp": 1.00052714, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.8964659611271886, + "language_loss": 0.79411107, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81639981, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 4.029110431671143 + }, + { + "auxiliary_loss_clip": 0.01151453, + "auxiliary_loss_mlp": 0.01108483, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00033677, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.0932268602886888, + "language_loss": 0.74531823, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76791763, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.57326078414917 + }, + { + "auxiliary_loss_clip": 0.01166377, + "auxiliary_loss_mlp": 0.01108064, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00039458, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.8182592574149272, + "language_loss": 0.77722383, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79996824, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.4675543308258057 + }, + { + "auxiliary_loss_clip": 0.01166509, + "auxiliary_loss_mlp": 0.01108752, + "balance_loss_clip": 1.0021379, + "balance_loss_mlp": 1.00070047, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.6931866834185338, + "language_loss": 0.69143623, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71418887, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.5562047958374023 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01107279, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00056267, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.5642883679758248, + "language_loss": 0.74190593, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76432085, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 2.610105037689209 + }, + { + "auxiliary_loss_clip": 0.01150746, + "auxiliary_loss_mlp": 0.00747586, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00079381, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.3457307758945116, + "language_loss": 0.80382746, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82281077, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.5899081230163574 + }, + { + "auxiliary_loss_clip": 0.01118369, + "auxiliary_loss_mlp": 0.0110925, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00062633, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 1.8179300370048967, + "language_loss": 0.68347371, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70574987, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 2.5945565700531006 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01108946, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00079989, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.7460975215787409, + "language_loss": 0.71772093, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.74001229, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.6120245456695557 + }, + { + "auxiliary_loss_clip": 0.01151248, + "auxiliary_loss_mlp": 0.01108955, + "balance_loss_clip": 1.00197959, + "balance_loss_mlp": 1.00052202, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.473063179329056, + "language_loss": 0.79949415, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82209623, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 2.5611343383789062 + }, + { + "auxiliary_loss_clip": 0.01162105, + "auxiliary_loss_mlp": 0.01087355, + "balance_loss_clip": 1.00128722, + "balance_loss_mlp": 0.99999869, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7718921937419927, + "language_loss": 0.59191906, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61441368, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 4.575185537338257 + }, + { + "auxiliary_loss_clip": 0.01128478, + "auxiliary_loss_mlp": 0.01087357, + "balance_loss_clip": 1.00124371, + "balance_loss_mlp": 1.00000024, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.88270669138211, + "language_loss": 0.63802248, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.66018081, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.202516555786133 + }, + { + "auxiliary_loss_clip": 0.01132928, + "auxiliary_loss_mlp": 0.01108548, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00059223, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.782944053395441, + "language_loss": 0.65119147, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67360622, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 4.032986402511597 + }, + { + "auxiliary_loss_clip": 0.01149688, + "auxiliary_loss_mlp": 0.01109276, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00055683, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.8375048516739594, + "language_loss": 0.61827242, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64086199, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 2.504258871078491 + }, + { + "auxiliary_loss_clip": 0.01115973, + "auxiliary_loss_mlp": 0.01108349, + "balance_loss_clip": 1.00160158, + "balance_loss_mlp": 1.00048828, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.5132620399697998, + "language_loss": 0.84432626, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86656952, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 2.6171021461486816 + }, + { + "auxiliary_loss_clip": 0.01162071, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_clip": 1.00125289, + "balance_loss_mlp": 0.99997443, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7348637078588569, + "language_loss": 0.61586827, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63836229, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 3.083772659301758 + }, + { + "auxiliary_loss_clip": 0.0113357, + "auxiliary_loss_mlp": 0.01109038, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00060534, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.4691836028053922, + "language_loss": 0.77578121, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79820734, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 4.027674674987793 + }, + { + "auxiliary_loss_clip": 0.01145569, + "auxiliary_loss_mlp": 0.01086976, + "balance_loss_clip": 1.00132656, + "balance_loss_mlp": 1.00000048, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7238192063811667, + "language_loss": 0.54346877, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56579423, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 3.060964822769165 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01108585, + "balance_loss_clip": 1.00183475, + "balance_loss_mlp": 1.00072503, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 1.6219394728494254, + "language_loss": 0.84282672, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86493027, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 2.703933000564575 + }, + { + "auxiliary_loss_clip": 0.01116344, + "auxiliary_loss_mlp": 0.01109616, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00070643, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.113745984126093, + "language_loss": 0.80231631, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.8245759, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.61976957321167 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_clip": 1.00169206, + "balance_loss_mlp": 1.00037575, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.3754671119397623, + "language_loss": 0.74526525, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76734841, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.7020602226257324 + }, + { + "auxiliary_loss_clip": 0.01151737, + "auxiliary_loss_mlp": 0.01107837, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00045383, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.657562215217609, + "language_loss": 0.72695756, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74955332, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.5410051345825195 + }, + { + "auxiliary_loss_clip": 0.01120284, + "auxiliary_loss_mlp": 0.01108827, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00058484, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.4933836668083176, + "language_loss": 0.71422458, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73651564, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.669759750366211 + }, + { + "auxiliary_loss_clip": 0.01166307, + "auxiliary_loss_mlp": 0.0110824, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00057054, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 2.5541278593203844, + "language_loss": 0.7309444, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75368989, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 2.5337586402893066 + }, + { + "auxiliary_loss_clip": 0.01118743, + "auxiliary_loss_mlp": 0.01108037, + "balance_loss_clip": 1.00189245, + "balance_loss_mlp": 1.00046265, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.7899583028818455, + "language_loss": 0.72533685, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74760467, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.6173295974731445 + }, + { + "auxiliary_loss_clip": 0.01149483, + "auxiliary_loss_mlp": 0.01107806, + "balance_loss_clip": 1.00195706, + "balance_loss_mlp": 1.00042224, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.5322903666604262, + "language_loss": 0.68875712, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71133, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.5736398696899414 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_clip": 1.00172532, + "balance_loss_mlp": 1.00059628, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 2.3772772690269703, + "language_loss": 0.73583221, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75796676, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 2.642551898956299 + }, + { + "auxiliary_loss_clip": 0.01146616, + "auxiliary_loss_mlp": 0.01086965, + "balance_loss_clip": 1.00125635, + "balance_loss_mlp": 0.99999017, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6777274985191141, + "language_loss": 0.52400196, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54633772, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 2.99746036529541 + }, + { + "auxiliary_loss_clip": 0.01134636, + "auxiliary_loss_mlp": 0.01109006, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00057316, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.0126715189697695, + "language_loss": 0.91406602, + "learning_rate": 1.282785392633079e-06, + "loss": 0.9365024, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.549241304397583 + }, + { + "auxiliary_loss_clip": 0.0116628, + "auxiliary_loss_mlp": 0.01108207, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00044179, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.924784638706964, + "language_loss": 0.60005963, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62280452, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 2.701677083969116 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00058043, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.5296943267192031, + "language_loss": 0.7680465, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79047132, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 2.5725932121276855 + }, + { + "auxiliary_loss_clip": 0.01134911, + "auxiliary_loss_mlp": 0.01109313, + "balance_loss_clip": 1.00186431, + "balance_loss_mlp": 1.00059462, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.5805771977775922, + "language_loss": 0.77171743, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79415965, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.5879900455474854 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.01107689, + "balance_loss_clip": 1.0017544, + "balance_loss_mlp": 1.00059187, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.5925512414946184, + "language_loss": 0.72564477, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74790782, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.6517834663391113 + }, + { + "auxiliary_loss_clip": 0.0110354, + "auxiliary_loss_mlp": 0.01108679, + "balance_loss_clip": 1.00159216, + "balance_loss_mlp": 1.00043702, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.707192628365243, + "language_loss": 0.80389416, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82601643, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 2.6208138465881348 + }, + { + "auxiliary_loss_clip": 0.01118398, + "auxiliary_loss_mlp": 0.0110847, + "balance_loss_clip": 1.00177145, + "balance_loss_mlp": 1.0006094, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.8125634918864832, + "language_loss": 0.81759709, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.83986574, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.6376707553863525 + }, + { + "auxiliary_loss_clip": 0.01104913, + "auxiliary_loss_mlp": 0.00747594, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00078666, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.4480298434461527, + "language_loss": 0.81702542, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83555043, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 2.7063443660736084 + }, + { + "auxiliary_loss_clip": 0.01132112, + "auxiliary_loss_mlp": 0.01109485, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00048041, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5392164802400625, + "language_loss": 0.72266924, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74508512, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.6072702407836914 + }, + { + "auxiliary_loss_clip": 0.01151519, + "auxiliary_loss_mlp": 0.01108964, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00053084, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 3.6171833399684448, + "language_loss": 0.79546142, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81806624, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.552156448364258 + }, + { + "auxiliary_loss_clip": 0.01150039, + "auxiliary_loss_mlp": 0.01108803, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00056076, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.5221002928817007, + "language_loss": 0.61202651, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63461494, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 4.060155630111694 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01107723, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00072026, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.6226804549621268, + "language_loss": 0.7851423, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80754375, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.6032326221466064 + }, + { + "auxiliary_loss_clip": 0.01120369, + "auxiliary_loss_mlp": 0.01108013, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.00043809, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.722222385214835, + "language_loss": 0.74050152, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76278532, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 2.578308582305908 + }, + { + "auxiliary_loss_clip": 0.01134814, + "auxiliary_loss_mlp": 0.01108397, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.0005362, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.9096643475088126, + "language_loss": 0.70096987, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72340202, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.594143867492676 + }, + { + "auxiliary_loss_clip": 0.01166113, + "auxiliary_loss_mlp": 0.01106091, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00042427, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.7609662691106494, + "language_loss": 0.72166169, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74438375, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.5588722229003906 + }, + { + "auxiliary_loss_clip": 0.01133908, + "auxiliary_loss_mlp": 0.01107943, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00065506, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.6827525202267322, + "language_loss": 0.72585112, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74826968, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 2.567147731781006 + }, + { + "auxiliary_loss_clip": 0.01131389, + "auxiliary_loss_mlp": 0.01108101, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.00043118, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.5746506377360454, + "language_loss": 0.69477212, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71716708, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.531809091567993 + }, + { + "auxiliary_loss_clip": 0.01145948, + "auxiliary_loss_mlp": 0.01087039, + "balance_loss_clip": 1.00125742, + "balance_loss_mlp": 1.00006354, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.9214326048369453, + "language_loss": 0.59748715, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61981702, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 3.2242965698242188 + }, + { + "auxiliary_loss_clip": 0.01117302, + "auxiliary_loss_mlp": 0.01107542, + "balance_loss_clip": 1.00155962, + "balance_loss_mlp": 1.00034869, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 2.0065557983864055, + "language_loss": 0.65033525, + "learning_rate": 1.276245767820154e-06, + "loss": 0.6725837, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.7495133876800537 + }, + { + "auxiliary_loss_clip": 0.01129946, + "auxiliary_loss_mlp": 0.01086673, + "balance_loss_clip": 1.0012188, + "balance_loss_mlp": 1.00007915, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7958901559305506, + "language_loss": 0.56892669, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.59109282, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 2.9118175506591797 + }, + { + "auxiliary_loss_clip": 0.01098703, + "auxiliary_loss_mlp": 0.01086984, + "balance_loss_clip": 1.00107741, + "balance_loss_mlp": 1.00000858, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7313710853524086, + "language_loss": 0.57922703, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.60108393, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 4.575127363204956 + }, + { + "auxiliary_loss_clip": 0.01145816, + "auxiliary_loss_mlp": 0.01087305, + "balance_loss_clip": 1.00156474, + "balance_loss_mlp": 1.00032997, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6809007043940237, + "language_loss": 0.52102208, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54335332, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.1398239135742188 + }, + { + "auxiliary_loss_clip": 0.011493, + "auxiliary_loss_mlp": 0.011074, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00058818, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.747057176538105, + "language_loss": 0.74365985, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76622689, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 4.161880254745483 + }, + { + "auxiliary_loss_clip": 0.01133235, + "auxiliary_loss_mlp": 0.01108955, + "balance_loss_clip": 1.00203192, + "balance_loss_mlp": 1.00052202, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 2.09336914847976, + "language_loss": 0.62884343, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65126532, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 2.551543951034546 + }, + { + "auxiliary_loss_clip": 0.01166499, + "auxiliary_loss_mlp": 0.01108775, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00053334, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 2.6143868875903618, + "language_loss": 0.69509679, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71784955, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 2.5326926708221436 + }, + { + "auxiliary_loss_clip": 0.01134959, + "auxiliary_loss_mlp": 0.01107758, + "balance_loss_clip": 1.00191271, + "balance_loss_mlp": 1.00056458, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.659341347174002, + "language_loss": 0.74772966, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.77015674, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.5773141384124756 + }, + { + "auxiliary_loss_clip": 0.01134935, + "auxiliary_loss_mlp": 0.00747606, + "balance_loss_clip": 1.00183022, + "balance_loss_mlp": 1.0009253, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 2.924647075655656, + "language_loss": 0.65996009, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.6787855, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 4.083176612854004 + }, + { + "auxiliary_loss_clip": 0.01099213, + "auxiliary_loss_mlp": 0.0110746, + "balance_loss_clip": 1.00160944, + "balance_loss_mlp": 1.00055361, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.8501412204838186, + "language_loss": 0.90231442, + "learning_rate": 1.272979284940101e-06, + "loss": 0.9243812, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.606614828109741 + }, + { + "auxiliary_loss_clip": 0.01166227, + "auxiliary_loss_mlp": 0.01107692, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00059462, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.789923615739185, + "language_loss": 0.75136518, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.77410436, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.5044291019439697 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.011078, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00051177, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 2.1227014027832474, + "language_loss": 0.70397389, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72654939, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 2.53302264213562 + }, + { + "auxiliary_loss_clip": 0.01151734, + "auxiliary_loss_mlp": 0.01108614, + "balance_loss_clip": 1.00195229, + "balance_loss_mlp": 1.00046742, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.4198236843871688, + "language_loss": 0.67358363, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69618714, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.505911111831665 + }, + { + "auxiliary_loss_clip": 0.01134584, + "auxiliary_loss_mlp": 0.00747537, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00083542, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 1.5981108506141386, + "language_loss": 0.73682296, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.7556442, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.59560227394104 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01108682, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00053513, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 1.8023918886865502, + "language_loss": 0.78684825, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.80943155, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 2.522726535797119 + }, + { + "auxiliary_loss_clip": 0.01132386, + "auxiliary_loss_mlp": 0.01086937, + "balance_loss_clip": 1.00125718, + "balance_loss_mlp": 0.99996203, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8864563770729368, + "language_loss": 0.61740994, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63960314, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 2.9230735301971436 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01108671, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00052452, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.0599739912652755, + "language_loss": 0.82725215, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.84983689, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.576798677444458 + }, + { + "auxiliary_loss_clip": 0.01149712, + "auxiliary_loss_mlp": 0.01107626, + "balance_loss_clip": 1.00198042, + "balance_loss_mlp": 1.00043273, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.553406352262022, + "language_loss": 0.72430634, + "learning_rate": 1.270077618961487e-06, + "loss": 0.7468797, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.5911810398101807 + }, + { + "auxiliary_loss_clip": 0.01118235, + "auxiliary_loss_mlp": 0.01108376, + "balance_loss_clip": 1.00168633, + "balance_loss_mlp": 1.00041997, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.6030852443398704, + "language_loss": 0.74225879, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76452494, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.6718811988830566 + }, + { + "auxiliary_loss_clip": 0.01133195, + "auxiliary_loss_mlp": 0.00747611, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00089264, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.9329645148360306, + "language_loss": 0.81169236, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83050042, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.6401171684265137 + }, + { + "auxiliary_loss_clip": 0.01132804, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_clip": 1.00191319, + "balance_loss_mlp": 1.0005157, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.725338291926834, + "language_loss": 0.63602209, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65842438, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 2.55078125 + }, + { + "auxiliary_loss_clip": 0.01166302, + "auxiliary_loss_mlp": 0.01108329, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00075436, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.4507490846711564, + "language_loss": 0.67091817, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69366443, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 2.5458827018737793 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01108145, + "balance_loss_clip": 1.0017612, + "balance_loss_mlp": 1.00047517, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.6553305443843924, + "language_loss": 0.67529476, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69771415, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 2.584390640258789 + }, + { + "auxiliary_loss_clip": 0.01118255, + "auxiliary_loss_mlp": 0.01109548, + "balance_loss_clip": 1.00179553, + "balance_loss_mlp": 1.00054312, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.7717098697104847, + "language_loss": 0.69372195, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71599996, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.6213607788085938 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.0110883, + "balance_loss_clip": 1.00192058, + "balance_loss_mlp": 1.00068367, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 1.9504525317888657, + "language_loss": 0.78204578, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80448604, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 2.5937795639038086 + }, + { + "auxiliary_loss_clip": 0.01134758, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00048804, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.6881123041909307, + "language_loss": 0.55819356, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58062273, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 2.610738515853882 + }, + { + "auxiliary_loss_clip": 0.0116637, + "auxiliary_loss_mlp": 0.01107955, + "balance_loss_clip": 1.00197923, + "balance_loss_mlp": 1.00047576, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.6887704067392504, + "language_loss": 0.64417166, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66691494, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.5015804767608643 + }, + { + "auxiliary_loss_clip": 0.01117403, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_clip": 1.0016315, + "balance_loss_mlp": 1.00043964, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.4061696513164486, + "language_loss": 0.82620692, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84846115, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.639143705368042 + }, + { + "auxiliary_loss_clip": 0.01135878, + "auxiliary_loss_mlp": 0.01108174, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00050449, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 2.003467435690845, + "language_loss": 0.79261756, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81505811, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 2.7379119396209717 + }, + { + "auxiliary_loss_clip": 0.01135187, + "auxiliary_loss_mlp": 0.01108704, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00055766, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.7972657162574497, + "language_loss": 0.70431125, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72675014, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 2.549379587173462 + }, + { + "auxiliary_loss_clip": 0.01134797, + "auxiliary_loss_mlp": 0.01109199, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00067067, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.3256995806934593, + "language_loss": 0.80080163, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82324159, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 3.9227042198181152 + }, + { + "auxiliary_loss_clip": 0.01135353, + "auxiliary_loss_mlp": 0.01107894, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.0006057, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.8255423458608673, + "language_loss": 0.74127245, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76370496, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 2.5642778873443604 + }, + { + "auxiliary_loss_clip": 0.0115133, + "auxiliary_loss_mlp": 0.01108013, + "balance_loss_clip": 1.00183976, + "balance_loss_mlp": 1.00053346, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 2.172335656758392, + "language_loss": 0.69904149, + "learning_rate": 1.264641775364217e-06, + "loss": 0.72163492, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.538914203643799 + }, + { + "auxiliary_loss_clip": 0.01150677, + "auxiliary_loss_mlp": 0.01107919, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00063062, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 1.8953250315694328, + "language_loss": 0.6967842, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.71937013, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.6394824981689453 + }, + { + "auxiliary_loss_clip": 0.01166312, + "auxiliary_loss_mlp": 0.01107949, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00056481, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 1.7173520001535842, + "language_loss": 0.74091566, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76365829, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.5694687366485596 + }, + { + "auxiliary_loss_clip": 0.01149352, + "auxiliary_loss_mlp": 0.00747569, + "balance_loss_clip": 1.00184691, + "balance_loss_mlp": 1.00077057, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.6015332730699985, + "language_loss": 0.75366527, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77263451, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 2.5688862800598145 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01109627, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00081301, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 1.9297722183263504, + "language_loss": 0.85440993, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87700391, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 2.542370319366455 + }, + { + "auxiliary_loss_clip": 0.01132298, + "auxiliary_loss_mlp": 0.01107883, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00049949, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 1.6458085166757348, + "language_loss": 0.86500782, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88740957, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 2.5631771087646484 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01109109, + "balance_loss_clip": 1.0018512, + "balance_loss_mlp": 1.00058079, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.5342853882505023, + "language_loss": 0.76243818, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78469282, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.633500576019287 + }, + { + "auxiliary_loss_clip": 0.01104694, + "auxiliary_loss_mlp": 0.01107966, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00048661, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 3.2620611729163733, + "language_loss": 0.82224268, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84436929, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.6924076080322266 + }, + { + "auxiliary_loss_clip": 0.01166325, + "auxiliary_loss_mlp": 0.01108256, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00058591, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.8404225267617003, + "language_loss": 0.74510974, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76785553, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 3.9280989170074463 + }, + { + "auxiliary_loss_clip": 0.01133049, + "auxiliary_loss_mlp": 0.01109258, + "balance_loss_clip": 1.00171554, + "balance_loss_mlp": 1.00063467, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.7977605714930438, + "language_loss": 0.67886209, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.70128512, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 2.6187026500701904 + }, + { + "auxiliary_loss_clip": 0.01121035, + "auxiliary_loss_mlp": 0.01108287, + "balance_loss_clip": 1.00189245, + "balance_loss_mlp": 1.00052178, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.5529407582382477, + "language_loss": 0.70848596, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73077923, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 4.040446519851685 + }, + { + "auxiliary_loss_clip": 0.0114983, + "auxiliary_loss_mlp": 0.01108029, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00045419, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.6224797735591254, + "language_loss": 0.79391187, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81649053, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.5351357460021973 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.00747728, + "balance_loss_clip": 1.00172257, + "balance_loss_mlp": 1.00090671, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.3722244388056954, + "language_loss": 0.70493019, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72340268, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 2.69858455657959 + }, + { + "auxiliary_loss_clip": 0.01166273, + "auxiliary_loss_mlp": 0.01108146, + "balance_loss_clip": 1.00197947, + "balance_loss_mlp": 1.00057197, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.6806034312982534, + "language_loss": 0.80048239, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82322657, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.504289150238037 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_clip": 1.00193655, + "balance_loss_mlp": 1.00057149, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.8906941972255595, + "language_loss": 0.70603931, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72861719, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 3.9793453216552734 + }, + { + "auxiliary_loss_clip": 0.01151865, + "auxiliary_loss_mlp": 0.01108496, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00044429, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.585574274961346, + "language_loss": 0.66121495, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68381852, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.5539917945861816 + }, + { + "auxiliary_loss_clip": 0.01121653, + "auxiliary_loss_mlp": 0.01107364, + "balance_loss_clip": 1.00175691, + "balance_loss_mlp": 1.00045729, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 2.194616127336181, + "language_loss": 0.75164402, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.77393425, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.6573407649993896 + }, + { + "auxiliary_loss_clip": 0.01132528, + "auxiliary_loss_mlp": 0.0110675, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00041521, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.697359235033554, + "language_loss": 0.90057081, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.92296362, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 2.582536458969116 + }, + { + "auxiliary_loss_clip": 0.01166588, + "auxiliary_loss_mlp": 0.01110544, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00058579, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.939767552400306, + "language_loss": 0.81882721, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84159851, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.01082484, + "auxiliary_loss_mlp": 0.0110712, + "balance_loss_clip": 1.00155413, + "balance_loss_mlp": 1.00049913, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.5973893466836901, + "language_loss": 0.77440763, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79630357, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 2.6788249015808105 + }, + { + "auxiliary_loss_clip": 0.0114983, + "auxiliary_loss_mlp": 0.01108184, + "balance_loss_clip": 1.00185025, + "balance_loss_mlp": 1.0006094, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.5410538208772084, + "language_loss": 0.85073566, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87331575, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 2.578857421875 + }, + { + "auxiliary_loss_clip": 0.0113469, + "auxiliary_loss_mlp": 0.01107347, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.00053561, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.4854585067444541, + "language_loss": 0.71789986, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74032027, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.6234235763549805 + }, + { + "auxiliary_loss_clip": 0.0114967, + "auxiliary_loss_mlp": 0.0110778, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00049138, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.928751460763176, + "language_loss": 0.71810633, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.74068081, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.5359814167022705 + }, + { + "auxiliary_loss_clip": 0.01120191, + "auxiliary_loss_mlp": 0.01108541, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00048983, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.8076062647690816, + "language_loss": 0.71857005, + "learning_rate": 1.256319016853377e-06, + "loss": 0.74085736, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.615386962890625 + }, + { + "auxiliary_loss_clip": 0.0110096, + "auxiliary_loss_mlp": 0.01107303, + "balance_loss_clip": 1.00163817, + "balance_loss_mlp": 1.00049162, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.727315840174996, + "language_loss": 0.82008874, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.84217137, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.643834352493286 + }, + { + "auxiliary_loss_clip": 0.01149538, + "auxiliary_loss_mlp": 0.01108415, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.0005548, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.3023168102824823, + "language_loss": 0.73829043, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76086992, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.5413401126861572 + }, + { + "auxiliary_loss_clip": 0.0113469, + "auxiliary_loss_mlp": 0.01109394, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00067508, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 1.9502460004973456, + "language_loss": 0.84167361, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.8641144, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.6372861862182617 + }, + { + "auxiliary_loss_clip": 0.01136377, + "auxiliary_loss_mlp": 0.01107898, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00032389, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.7742996529012665, + "language_loss": 0.66678834, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68923116, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.558330774307251 + }, + { + "auxiliary_loss_clip": 0.01149303, + "auxiliary_loss_mlp": 0.01108903, + "balance_loss_clip": 1.00196946, + "balance_loss_mlp": 1.00056529, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.8064551582408044, + "language_loss": 0.73510265, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75768471, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 2.5705573558807373 + }, + { + "auxiliary_loss_clip": 0.01149521, + "auxiliary_loss_mlp": 0.01107188, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00047147, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 2.0047756478607037, + "language_loss": 0.71917659, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74174368, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 2.5329298973083496 + }, + { + "auxiliary_loss_clip": 0.0114948, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_clip": 1.00193632, + "balance_loss_mlp": 1.000453, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.047238047425115, + "language_loss": 0.66930318, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.6918782, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 2.5010876655578613 + }, + { + "auxiliary_loss_clip": 0.0115193, + "auxiliary_loss_mlp": 0.01109259, + "balance_loss_clip": 1.00212228, + "balance_loss_mlp": 1.00044465, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 7.626464017162687, + "language_loss": 0.74849248, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77110434, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 2.5305769443511963 + }, + { + "auxiliary_loss_clip": 0.01149759, + "auxiliary_loss_mlp": 0.00747459, + "balance_loss_clip": 1.00207067, + "balance_loss_mlp": 1.00078917, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 1.520858301636995, + "language_loss": 0.73949051, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75846267, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.567229986190796 + }, + { + "auxiliary_loss_clip": 0.01116343, + "auxiliary_loss_mlp": 0.01107506, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00050354, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.3474550314318856, + "language_loss": 0.79551709, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81775558, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 2.589972734451294 + }, + { + "auxiliary_loss_clip": 0.01149671, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00043452, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.5448681374165256, + "language_loss": 0.74397659, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76654017, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.5554966926574707 + }, + { + "auxiliary_loss_clip": 0.01132712, + "auxiliary_loss_mlp": 0.01109468, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00074935, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.2537671148567417, + "language_loss": 0.77148104, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79390287, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.5425705909729004 + }, + { + "auxiliary_loss_clip": 0.01121255, + "auxiliary_loss_mlp": 0.01108093, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.0005182, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.424549946906958, + "language_loss": 0.85302079, + "learning_rate": 1.251621437204777e-06, + "loss": 0.87531435, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 4.056253910064697 + }, + { + "auxiliary_loss_clip": 0.01151556, + "auxiliary_loss_mlp": 0.01108098, + "balance_loss_clip": 1.00193107, + "balance_loss_mlp": 1.00042868, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 1.8388554349846178, + "language_loss": 0.7652334, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78783, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 2.5756640434265137 + }, + { + "auxiliary_loss_clip": 0.01151425, + "auxiliary_loss_mlp": 0.01108255, + "balance_loss_clip": 1.00216365, + "balance_loss_mlp": 1.00048971, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.5349790379298032, + "language_loss": 0.59884721, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62144399, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 2.588606119155884 + }, + { + "auxiliary_loss_clip": 0.01112556, + "auxiliary_loss_mlp": 0.01087499, + "balance_loss_clip": 1.00117683, + "balance_loss_mlp": 1.00014257, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7701127427061102, + "language_loss": 0.52418905, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54618961, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 3.2815561294555664 + }, + { + "auxiliary_loss_clip": 0.0113335, + "auxiliary_loss_mlp": 0.01109566, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00056136, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.75411753904595, + "language_loss": 0.83381522, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85624439, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.5863966941833496 + }, + { + "auxiliary_loss_clip": 0.01135211, + "auxiliary_loss_mlp": 0.01108553, + "balance_loss_clip": 1.00207031, + "balance_loss_mlp": 1.00040674, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.629400052495302, + "language_loss": 0.86811495, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.89055252, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 2.646186590194702 + }, + { + "auxiliary_loss_clip": 0.01132779, + "auxiliary_loss_mlp": 0.01107999, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.000615, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 4.06093110975957, + "language_loss": 0.72565532, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74806309, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.6342062950134277 + }, + { + "auxiliary_loss_clip": 0.0114977, + "auxiliary_loss_mlp": 0.01109173, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00045419, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.201590709163435, + "language_loss": 0.84454495, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86713439, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 2.6780965328216553 + }, + { + "auxiliary_loss_clip": 0.01149616, + "auxiliary_loss_mlp": 0.01107567, + "balance_loss_clip": 1.00199294, + "balance_loss_mlp": 1.0003736, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.5951241282012611, + "language_loss": 0.77622116, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79879296, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 2.510303020477295 + }, + { + "auxiliary_loss_clip": 0.01099362, + "auxiliary_loss_mlp": 0.01106525, + "balance_loss_clip": 1.00160277, + "balance_loss_mlp": 1.00057173, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.5741645264553892, + "language_loss": 0.73383117, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75589001, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.6896839141845703 + }, + { + "auxiliary_loss_clip": 0.01119697, + "auxiliary_loss_mlp": 0.01109735, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00063455, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 1.8681951798500278, + "language_loss": 0.68548083, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70777518, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 4.123550891876221 + }, + { + "auxiliary_loss_clip": 0.0113424, + "auxiliary_loss_mlp": 0.01107862, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00066853, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.0764514713172497, + "language_loss": 0.71295041, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73537135, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 2.573211908340454 + }, + { + "auxiliary_loss_clip": 0.01149435, + "auxiliary_loss_mlp": 0.01107554, + "balance_loss_clip": 1.0020448, + "balance_loss_mlp": 1.00055194, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.3513484582364237, + "language_loss": 0.78374076, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80631059, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 4.011856555938721 + }, + { + "auxiliary_loss_clip": 0.01104179, + "auxiliary_loss_mlp": 0.01109113, + "balance_loss_clip": 1.00171208, + "balance_loss_mlp": 1.00058508, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.6884019155339158, + "language_loss": 0.63379967, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65593255, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 2.640018939971924 + }, + { + "auxiliary_loss_clip": 0.01136989, + "auxiliary_loss_mlp": 0.01107963, + "balance_loss_clip": 1.00201106, + "balance_loss_mlp": 1.00057936, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.4614732339614134, + "language_loss": 0.62018257, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.64263213, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.6078104972839355 + }, + { + "auxiliary_loss_clip": 0.01099307, + "auxiliary_loss_mlp": 0.01108153, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00067425, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.5011894528432081, + "language_loss": 0.73407501, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75614959, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 4.086817264556885 + }, + { + "auxiliary_loss_clip": 0.01112816, + "auxiliary_loss_mlp": 0.01086764, + "balance_loss_clip": 1.00087309, + "balance_loss_mlp": 1.00017011, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6870149050469931, + "language_loss": 0.57689202, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59888792, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.2068824768066406 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.01106573, + "balance_loss_clip": 1.0016526, + "balance_loss_mlp": 1.00052488, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.675764099311482, + "language_loss": 0.66940904, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69163704, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.632305860519409 + }, + { + "auxiliary_loss_clip": 0.011195, + "auxiliary_loss_mlp": 0.01109388, + "balance_loss_clip": 1.00183964, + "balance_loss_mlp": 1.00038314, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.6455241288198705, + "language_loss": 0.8209604, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84324932, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 2.6223554611206055 + }, + { + "auxiliary_loss_clip": 0.01149515, + "auxiliary_loss_mlp": 0.01109167, + "balance_loss_clip": 1.00191331, + "balance_loss_mlp": 1.00044847, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 3.8201137101841693, + "language_loss": 0.54715025, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.56973708, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.7257049083709717 + }, + { + "auxiliary_loss_clip": 0.01134285, + "auxiliary_loss_mlp": 0.01108578, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00043178, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 2.006148975621064, + "language_loss": 0.70890981, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.73133844, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.633283853530884 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_clip": 1.0012064, + "balance_loss_mlp": 1.00007236, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.7727679394147715, + "language_loss": 0.55352426, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57570457, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.1528234481811523 + }, + { + "auxiliary_loss_clip": 0.01134538, + "auxiliary_loss_mlp": 0.01109266, + "balance_loss_clip": 1.00189126, + "balance_loss_mlp": 1.00054717, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.775111306524998, + "language_loss": 0.68536723, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70780528, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.755247116088867 + }, + { + "auxiliary_loss_clip": 0.01135002, + "auxiliary_loss_mlp": 0.01108245, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00057507, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.5932048020436422, + "language_loss": 0.70283282, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72526538, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 2.70390248298645 + }, + { + "auxiliary_loss_clip": 0.0113213, + "auxiliary_loss_mlp": 0.01107829, + "balance_loss_clip": 1.00176179, + "balance_loss_mlp": 1.00044465, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.5766005918842525, + "language_loss": 0.78382194, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80622149, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.698918104171753 + }, + { + "auxiliary_loss_clip": 0.01136773, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_clip": 1.00200927, + "balance_loss_mlp": 1.00051665, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.8738365132227817, + "language_loss": 0.67959952, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70205957, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.718071222305298 + }, + { + "auxiliary_loss_clip": 0.01134539, + "auxiliary_loss_mlp": 0.01108552, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00059605, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 11.18906776274839, + "language_loss": 0.77121979, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79365069, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.628023386001587 + }, + { + "auxiliary_loss_clip": 0.01132908, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00073171, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 1.7913139935958489, + "language_loss": 0.71917003, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74158508, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.6330251693725586 + }, + { + "auxiliary_loss_clip": 0.01149952, + "auxiliary_loss_mlp": 0.0110921, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00049162, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 4.25773751150453, + "language_loss": 0.80752552, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83011711, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 2.531968355178833 + }, + { + "auxiliary_loss_clip": 0.01118673, + "auxiliary_loss_mlp": 0.01108212, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00063777, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.041363760062841, + "language_loss": 0.80637157, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.82864034, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 2.6226754188537598 + }, + { + "auxiliary_loss_clip": 0.01134827, + "auxiliary_loss_mlp": 0.01108804, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00065708, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.5300716547764903, + "language_loss": 0.72577119, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74820745, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.6829030513763428 + }, + { + "auxiliary_loss_clip": 0.01149953, + "auxiliary_loss_mlp": 0.01108832, + "balance_loss_clip": 1.00192869, + "balance_loss_mlp": 1.00058961, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 1.9527475951732223, + "language_loss": 0.69261032, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71519816, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 2.538867473602295 + }, + { + "auxiliary_loss_clip": 0.01149616, + "auxiliary_loss_mlp": 0.01108509, + "balance_loss_clip": 1.00198877, + "balance_loss_mlp": 1.0006485, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.6292593117025491, + "language_loss": 0.69902796, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.72160918, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 2.5904762744903564 + }, + { + "auxiliary_loss_clip": 0.01149576, + "auxiliary_loss_mlp": 0.01107405, + "balance_loss_clip": 1.00209224, + "balance_loss_mlp": 1.00049829, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 1.8377224792708795, + "language_loss": 0.84618783, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86875761, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 2.5491135120391846 + }, + { + "auxiliary_loss_clip": 0.01100589, + "auxiliary_loss_mlp": 0.01108603, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00055194, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.7688635237921788, + "language_loss": 0.83956003, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86165202, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 2.832197904586792 + }, + { + "auxiliary_loss_clip": 0.01148929, + "auxiliary_loss_mlp": 0.01107313, + "balance_loss_clip": 1.00175917, + "balance_loss_mlp": 1.00050199, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.582211466046658, + "language_loss": 0.69339103, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71595347, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.651078224182129 + }, + { + "auxiliary_loss_clip": 0.01149575, + "auxiliary_loss_mlp": 0.01108425, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00046933, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 2.237833907556369, + "language_loss": 0.65599155, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67857158, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 2.6582257747650146 + }, + { + "auxiliary_loss_clip": 0.01151658, + "auxiliary_loss_mlp": 0.01108804, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00056171, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.7356086961255899, + "language_loss": 0.71171725, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73432183, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.518065929412842 + }, + { + "auxiliary_loss_clip": 0.01120357, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00060582, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.4416819253739868, + "language_loss": 0.81455213, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83683079, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 4.075404644012451 + }, + { + "auxiliary_loss_clip": 0.0113477, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00052977, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.632134620559776, + "language_loss": 0.6890623, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71149671, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 2.779448986053467 + }, + { + "auxiliary_loss_clip": 0.01166302, + "auxiliary_loss_mlp": 0.011083, + "balance_loss_clip": 1.00198328, + "balance_loss_mlp": 1.00053501, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.2340690230305387, + "language_loss": 0.86463118, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88737726, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 2.46244215965271 + }, + { + "auxiliary_loss_clip": 0.01166323, + "auxiliary_loss_mlp": 0.01108283, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00061333, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.7286201240488497, + "language_loss": 0.72036344, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74310946, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 2.5524959564208984 + }, + { + "auxiliary_loss_clip": 0.01132471, + "auxiliary_loss_mlp": 0.01108313, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00054741, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.5678563877421985, + "language_loss": 0.6918577, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71426558, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 2.614015579223633 + }, + { + "auxiliary_loss_clip": 0.01103077, + "auxiliary_loss_mlp": 0.01108464, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00050783, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.656960437678067, + "language_loss": 0.72369307, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74580848, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 2.8176321983337402 + }, + { + "auxiliary_loss_clip": 0.01117179, + "auxiliary_loss_mlp": 0.0074603, + "balance_loss_clip": 1.00129437, + "balance_loss_mlp": 1.00028253, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7009387048924494, + "language_loss": 0.54508722, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56371927, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.291703939437866 + }, + { + "auxiliary_loss_clip": 0.01134682, + "auxiliary_loss_mlp": 0.01107817, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00052845, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 2.2379797707807723, + "language_loss": 0.77277672, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79520166, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.6230998039245605 + }, + { + "auxiliary_loss_clip": 0.01119528, + "auxiliary_loss_mlp": 0.0074757, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00084054, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 1.9761973579663907, + "language_loss": 0.66420859, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68287963, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.677579879760742 + }, + { + "auxiliary_loss_clip": 0.01149436, + "auxiliary_loss_mlp": 0.01107705, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00051224, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.4580959157669873, + "language_loss": 0.6836729, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70624429, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 2.603487253189087 + }, + { + "auxiliary_loss_clip": 0.01133149, + "auxiliary_loss_mlp": 0.01107785, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00059223, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 1.886995378538393, + "language_loss": 0.84136933, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86377865, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 4.004753589630127 + }, + { + "auxiliary_loss_clip": 0.01134764, + "auxiliary_loss_mlp": 0.01108013, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00053334, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.4212642002736096, + "language_loss": 0.74922073, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77164853, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 3.990588903427124 + }, + { + "auxiliary_loss_clip": 0.0113342, + "auxiliary_loss_mlp": 0.0110794, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00065112, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.8505811409133717, + "language_loss": 0.72752249, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.7499361, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 2.559170722961426 + }, + { + "auxiliary_loss_clip": 0.01117662, + "auxiliary_loss_mlp": 0.01107474, + "balance_loss_clip": 1.00169003, + "balance_loss_mlp": 1.00047195, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 3.422061497360914, + "language_loss": 0.82962275, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.85187411, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.6381099224090576 + }, + { + "auxiliary_loss_clip": 0.01149483, + "auxiliary_loss_mlp": 0.01106466, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.00041747, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.4661751016019589, + "language_loss": 0.72682828, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74938774, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.5611324310302734 + }, + { + "auxiliary_loss_clip": 0.01132809, + "auxiliary_loss_mlp": 0.01107242, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00052547, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 1.8602942010984633, + "language_loss": 0.76645446, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.78885502, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 4.024447202682495 + }, + { + "auxiliary_loss_clip": 0.0110048, + "auxiliary_loss_mlp": 0.01106865, + "balance_loss_clip": 1.00166452, + "balance_loss_mlp": 1.00043488, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.3517417008786217, + "language_loss": 0.79757416, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.81964767, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.6668572425842285 + }, + { + "auxiliary_loss_clip": 0.01136109, + "auxiliary_loss_mlp": 0.01107447, + "balance_loss_clip": 1.00198185, + "balance_loss_mlp": 1.00044429, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.1110820479875456, + "language_loss": 0.67286569, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69530129, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.6175382137298584 + }, + { + "auxiliary_loss_clip": 0.01149596, + "auxiliary_loss_mlp": 0.01108419, + "balance_loss_clip": 1.00182068, + "balance_loss_mlp": 1.00055885, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6609589347287879, + "language_loss": 0.78931355, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81189376, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.5089669227600098 + }, + { + "auxiliary_loss_clip": 0.01134681, + "auxiliary_loss_mlp": 0.01108171, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.0005008, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.4281815274574963, + "language_loss": 0.88847768, + "learning_rate": 1.231081372744317e-06, + "loss": 0.9109062, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.5941925048828125 + }, + { + "auxiliary_loss_clip": 0.01151448, + "auxiliary_loss_mlp": 0.01107621, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00061846, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.3869577164704965, + "language_loss": 0.68292367, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70551437, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 2.565185308456421 + }, + { + "auxiliary_loss_clip": 0.01104219, + "auxiliary_loss_mlp": 0.01106786, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00045109, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.8690156218332623, + "language_loss": 0.63672435, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65883446, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.7713077068328857 + }, + { + "auxiliary_loss_clip": 0.01144724, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_clip": 1.00113106, + "balance_loss_mlp": 1.00009727, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.8014374780189587, + "language_loss": 0.54648167, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56879961, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.2264182567596436 + }, + { + "auxiliary_loss_clip": 0.01166445, + "auxiliary_loss_mlp": 0.01108084, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00060439, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.8432914816158874, + "language_loss": 0.66644895, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.68919426, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 2.5224151611328125 + }, + { + "auxiliary_loss_clip": 0.01151681, + "auxiliary_loss_mlp": 0.01107812, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00061882, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 1.9657479083251757, + "language_loss": 0.78935981, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.81195474, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.5259737968444824 + }, + { + "auxiliary_loss_clip": 0.0114961, + "auxiliary_loss_mlp": 0.01108077, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00050259, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 2.419246364966719, + "language_loss": 0.74924636, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77182317, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 2.542269468307495 + }, + { + "auxiliary_loss_clip": 0.01117817, + "auxiliary_loss_mlp": 0.00747394, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.00067902, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.8496620743985783, + "language_loss": 0.68713611, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70578825, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.5909934043884277 + }, + { + "auxiliary_loss_clip": 0.01117185, + "auxiliary_loss_mlp": 0.01108325, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00046408, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.098790119534169, + "language_loss": 0.80576289, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82801795, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.6491687297821045 + }, + { + "auxiliary_loss_clip": 0.01151359, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00072742, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.5980235946716699, + "language_loss": 0.79817879, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82077348, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.6361944675445557 + }, + { + "auxiliary_loss_clip": 0.01116589, + "auxiliary_loss_mlp": 0.01107709, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00061142, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 2.1311648195948956, + "language_loss": 0.6738438, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69608676, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.6622536182403564 + }, + { + "auxiliary_loss_clip": 0.01073157, + "auxiliary_loss_mlp": 0.0110787, + "balance_loss_clip": 1.00161815, + "balance_loss_mlp": 1.00039136, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.6704604934280902, + "language_loss": 0.79943621, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.82124645, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.8697047233581543 + }, + { + "auxiliary_loss_clip": 0.01100541, + "auxiliary_loss_mlp": 0.00747445, + "balance_loss_clip": 1.00159609, + "balance_loss_mlp": 1.00082958, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 1.9703220846344132, + "language_loss": 0.76645654, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78493643, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.8482236862182617 + }, + { + "auxiliary_loss_clip": 0.01134321, + "auxiliary_loss_mlp": 0.01108031, + "balance_loss_clip": 1.00180161, + "balance_loss_mlp": 1.00045633, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.6124012702020871, + "language_loss": 0.76773405, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79015756, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 2.591104030609131 + }, + { + "auxiliary_loss_clip": 0.01085922, + "auxiliary_loss_mlp": 0.01108408, + "balance_loss_clip": 1.00164628, + "balance_loss_mlp": 1.00045204, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.8332391649212438, + "language_loss": 0.65731561, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67925888, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 2.7175848484039307 + }, + { + "auxiliary_loss_clip": 0.01132856, + "auxiliary_loss_mlp": 0.01106974, + "balance_loss_clip": 1.00184143, + "balance_loss_mlp": 1.00054371, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.9624474213146093, + "language_loss": 0.75252825, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77492654, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01108683, + "balance_loss_clip": 1.00186872, + "balance_loss_mlp": 1.00063157, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.6292574278969143, + "language_loss": 0.65971237, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68212211, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 2.881554126739502 + }, + { + "auxiliary_loss_clip": 0.01069062, + "auxiliary_loss_mlp": 0.01086316, + "balance_loss_clip": 1.00125551, + "balance_loss_mlp": 1.00010395, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7276459797258952, + "language_loss": 0.51910824, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.54066205, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 4.782243967056274 + }, + { + "auxiliary_loss_clip": 0.01150768, + "auxiliary_loss_mlp": 0.0110627, + "balance_loss_clip": 1.00183976, + "balance_loss_mlp": 1.00041258, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.4901325288779275, + "language_loss": 0.74790156, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.77047199, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 2.80702543258667 + }, + { + "auxiliary_loss_clip": 0.01128187, + "auxiliary_loss_mlp": 0.01086573, + "balance_loss_clip": 1.00128245, + "balance_loss_mlp": 0.99997967, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8285440388817163, + "language_loss": 0.63119704, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65334463, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 3.1659677028656006 + }, + { + "auxiliary_loss_clip": 0.0114955, + "auxiliary_loss_mlp": 0.01107517, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00051427, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 1.935736384579089, + "language_loss": 0.72805667, + "learning_rate": 1.223896654187282e-06, + "loss": 0.75062728, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.6063196659088135 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01086653, + "balance_loss_clip": 1.0013268, + "balance_loss_mlp": 1.00005937, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7110532558350662, + "language_loss": 0.5785237, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.60070688, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 3.0546271800994873 + }, + { + "auxiliary_loss_clip": 0.01103554, + "auxiliary_loss_mlp": 0.01107591, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00068378, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.9860749614876876, + "language_loss": 0.75444317, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77655458, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.691868305206299 + }, + { + "auxiliary_loss_clip": 0.01134806, + "auxiliary_loss_mlp": 0.00747482, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00077438, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.7778274813664092, + "language_loss": 0.79754484, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81636775, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 2.614961624145508 + }, + { + "auxiliary_loss_clip": 0.01128723, + "auxiliary_loss_mlp": 0.01086778, + "balance_loss_clip": 1.00117183, + "balance_loss_mlp": 1.0001843, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6526492274410752, + "language_loss": 0.55636352, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57851845, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.2158095836639404 + }, + { + "auxiliary_loss_clip": 0.01136522, + "auxiliary_loss_mlp": 0.01108301, + "balance_loss_clip": 1.00204813, + "balance_loss_mlp": 1.00063169, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.7455082826648416, + "language_loss": 0.84105796, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.8635062, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.570188522338867 + }, + { + "auxiliary_loss_clip": 0.01149497, + "auxiliary_loss_mlp": 0.01108104, + "balance_loss_clip": 1.00191116, + "balance_loss_mlp": 1.00052905, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.7447367844248685, + "language_loss": 0.86783755, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89041358, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.519888401031494 + }, + { + "auxiliary_loss_clip": 0.01086826, + "auxiliary_loss_mlp": 0.0110697, + "balance_loss_clip": 1.0016892, + "balance_loss_mlp": 1.00063586, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.6669103363047189, + "language_loss": 0.73536927, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75730723, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.6779396533966064 + }, + { + "auxiliary_loss_clip": 0.01134989, + "auxiliary_loss_mlp": 0.01108888, + "balance_loss_clip": 1.00198305, + "balance_loss_mlp": 1.00064588, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 1.9471180129890986, + "language_loss": 0.76615059, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78858936, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 3.9803626537323 + }, + { + "auxiliary_loss_clip": 0.01133009, + "auxiliary_loss_mlp": 0.01107924, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00054038, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 3.5254348741227486, + "language_loss": 0.70335221, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72576153, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 2.622128963470459 + }, + { + "auxiliary_loss_clip": 0.01134007, + "auxiliary_loss_mlp": 0.01106256, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00049365, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.5787330810333906, + "language_loss": 0.77821332, + "learning_rate": 1.220308702586529e-06, + "loss": 0.80061597, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 4.28193998336792 + }, + { + "auxiliary_loss_clip": 0.01118175, + "auxiliary_loss_mlp": 0.01107185, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00056386, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.6746477546340666, + "language_loss": 0.74857074, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77082431, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 2.621583938598633 + }, + { + "auxiliary_loss_clip": 0.01134535, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_clip": 1.00179636, + "balance_loss_mlp": 1.00043225, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.396055778207929, + "language_loss": 0.76506674, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78747976, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 2.6125078201293945 + }, + { + "auxiliary_loss_clip": 0.01084493, + "auxiliary_loss_mlp": 0.01107765, + "balance_loss_clip": 1.0015521, + "balance_loss_mlp": 1.00047636, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.5358130029031192, + "language_loss": 0.80455351, + "learning_rate": 1.21923289302382e-06, + "loss": 0.8264761, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 4.1316819190979 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01108858, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00052023, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.7866503902998347, + "language_loss": 0.72867101, + "learning_rate": 1.218874349031654e-06, + "loss": 0.7510888, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 2.5449836254119873 + }, + { + "auxiliary_loss_clip": 0.01133748, + "auxiliary_loss_mlp": 0.01108761, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00051928, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.5680203146388036, + "language_loss": 0.72749627, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74992138, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.545858383178711 + }, + { + "auxiliary_loss_clip": 0.01120455, + "auxiliary_loss_mlp": 0.01108436, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00057542, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.7653087185818923, + "language_loss": 0.67330712, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69559598, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.692861318588257 + }, + { + "auxiliary_loss_clip": 0.0116612, + "auxiliary_loss_mlp": 0.01106753, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00041878, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 2.264427514280511, + "language_loss": 0.67808378, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.70081246, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.5192298889160156 + }, + { + "auxiliary_loss_clip": 0.0111847, + "auxiliary_loss_mlp": 0.01109398, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00067949, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 2.047263231441596, + "language_loss": 0.75347269, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77575135, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.6240017414093018 + }, + { + "auxiliary_loss_clip": 0.01136039, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.00047636, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.658714104605318, + "language_loss": 0.70673382, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72916907, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.580392599105835 + }, + { + "auxiliary_loss_clip": 0.01131391, + "auxiliary_loss_mlp": 0.0108622, + "balance_loss_clip": 1.00167465, + "balance_loss_mlp": 1.00000751, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7673262759477686, + "language_loss": 0.62898624, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65116239, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 3.217566728591919 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01107025, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00059462, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 1.8683942502624844, + "language_loss": 0.66507399, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68747342, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 2.5943830013275146 + }, + { + "auxiliary_loss_clip": 0.01068246, + "auxiliary_loss_mlp": 0.01106825, + "balance_loss_clip": 1.0017575, + "balance_loss_mlp": 1.00039542, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 1.8348430636152238, + "language_loss": 0.81939727, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84114802, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.7198336124420166 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.01108113, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00072944, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.8665083091153931, + "language_loss": 0.75039589, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77280754, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 2.5725655555725098 + }, + { + "auxiliary_loss_clip": 0.0114971, + "auxiliary_loss_mlp": 0.0110797, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.0006814, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 9.710436560677056, + "language_loss": 0.71320581, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73578262, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.5840067863464355 + }, + { + "auxiliary_loss_clip": 0.01132847, + "auxiliary_loss_mlp": 0.01108252, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00058246, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 1.6497804605962039, + "language_loss": 0.73718667, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.7595976, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 2.567295789718628 + }, + { + "auxiliary_loss_clip": 0.01149739, + "auxiliary_loss_mlp": 0.01108399, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00044298, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.810681374692332, + "language_loss": 0.77652246, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79910386, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.5252037048339844 + }, + { + "auxiliary_loss_clip": 0.01132797, + "auxiliary_loss_mlp": 0.01107448, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00054109, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 2.5494212550812425, + "language_loss": 0.81646323, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83886576, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.6316120624542236 + }, + { + "auxiliary_loss_clip": 0.01128915, + "auxiliary_loss_mlp": 0.01086293, + "balance_loss_clip": 1.00125492, + "balance_loss_mlp": 1.00008094, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8127761637793053, + "language_loss": 0.59006011, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61221218, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.118912696838379 + }, + { + "auxiliary_loss_clip": 0.01136889, + "auxiliary_loss_mlp": 0.01107813, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00052476, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.6441785557784212, + "language_loss": 0.78708076, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80952775, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.5754241943359375 + }, + { + "auxiliary_loss_clip": 0.01101925, + "auxiliary_loss_mlp": 0.01109228, + "balance_loss_clip": 1.00155604, + "balance_loss_mlp": 1.00060463, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.4855371811093383, + "language_loss": 0.63334185, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65545344, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.697345733642578 + }, + { + "auxiliary_loss_clip": 0.01129451, + "auxiliary_loss_mlp": 0.01086534, + "balance_loss_clip": 1.00130773, + "balance_loss_mlp": 0.99994051, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.9490926604197292, + "language_loss": 0.55946863, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58162844, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.0913586616516113 + }, + { + "auxiliary_loss_clip": 0.01118183, + "auxiliary_loss_mlp": 0.0110871, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00037241, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 1.8836714053665577, + "language_loss": 0.76790023, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79016912, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.6249754428863525 + }, + { + "auxiliary_loss_clip": 0.01116182, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_clip": 1.00191367, + "balance_loss_mlp": 1.00065804, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.3917915852109333, + "language_loss": 0.82375753, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84600163, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 2.6718533039093018 + }, + { + "auxiliary_loss_clip": 0.01151688, + "auxiliary_loss_mlp": 0.01107603, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00060081, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 2.24440345634739, + "language_loss": 0.73664057, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75923353, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 2.6230788230895996 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.0110815, + "balance_loss_clip": 1.00168407, + "balance_loss_mlp": 1.0004797, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.233903288944514, + "language_loss": 0.800933, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82321632, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 3.963873863220215 + }, + { + "auxiliary_loss_clip": 0.01105176, + "auxiliary_loss_mlp": 0.01107704, + "balance_loss_clip": 1.00194585, + "balance_loss_mlp": 1.00051069, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.8090751778289773, + "language_loss": 0.75833774, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.78046656, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 2.69101619720459 + }, + { + "auxiliary_loss_clip": 0.01135179, + "auxiliary_loss_mlp": 0.01107566, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00056422, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 2.121399035949873, + "language_loss": 0.78663504, + "learning_rate": 1.210636039936138e-06, + "loss": 0.8090626, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 2.5933046340942383 + }, + { + "auxiliary_loss_clip": 0.0108524, + "auxiliary_loss_mlp": 0.0110793, + "balance_loss_clip": 1.00144613, + "balance_loss_mlp": 1.00054622, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.6262892790233232, + "language_loss": 0.75331211, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77524382, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.654092311859131 + }, + { + "auxiliary_loss_clip": 0.01166227, + "auxiliary_loss_mlp": 0.01107773, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00058043, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 2.391581342557736, + "language_loss": 0.70748687, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.73022693, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.505871295928955 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.0110832, + "balance_loss_clip": 1.00181592, + "balance_loss_mlp": 1.00065041, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.9889421537050067, + "language_loss": 0.63668734, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65895063, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 2.626899003982544 + }, + { + "auxiliary_loss_clip": 0.0113678, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_clip": 1.00204194, + "balance_loss_mlp": 1.00051451, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.0750147610309786, + "language_loss": 0.7908541, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81329894, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.5374250411987305 + }, + { + "auxiliary_loss_clip": 0.0113657, + "auxiliary_loss_mlp": 0.01110679, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00062466, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.27660883238899, + "language_loss": 0.70256209, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72503459, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.591900110244751 + }, + { + "auxiliary_loss_clip": 0.01149653, + "auxiliary_loss_mlp": 0.01109092, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.0005641, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.4848018544056583, + "language_loss": 0.72773445, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75032187, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.5410993099212646 + }, + { + "auxiliary_loss_clip": 0.01116784, + "auxiliary_loss_mlp": 0.01108662, + "balance_loss_clip": 1.00174606, + "balance_loss_mlp": 1.00061035, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.5256076025386065, + "language_loss": 0.83006316, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.85231769, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.6910147666931152 + }, + { + "auxiliary_loss_clip": 0.01099673, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_clip": 1.00180638, + "balance_loss_mlp": 1.00064969, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.3129191874790345, + "language_loss": 0.72211432, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74419141, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 2.614361047744751 + }, + { + "auxiliary_loss_clip": 0.01117156, + "auxiliary_loss_mlp": 0.01108349, + "balance_loss_clip": 1.00180256, + "balance_loss_mlp": 1.00067902, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.037142949873632, + "language_loss": 0.77447015, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79672521, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 4.066961288452148 + }, + { + "auxiliary_loss_clip": 0.0116636, + "auxiliary_loss_mlp": 0.01108727, + "balance_loss_clip": 1.00202727, + "balance_loss_mlp": 1.00067592, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 2.589121527724187, + "language_loss": 0.76415682, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78690773, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 3.9220216274261475 + }, + { + "auxiliary_loss_clip": 0.01149602, + "auxiliary_loss_mlp": 0.01108844, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00050688, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.7230716255006524, + "language_loss": 0.77883732, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80142182, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.533607244491577 + }, + { + "auxiliary_loss_clip": 0.01135326, + "auxiliary_loss_mlp": 0.01109733, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00063252, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 1.973725792773983, + "language_loss": 0.68112689, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70357752, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 2.569206476211548 + }, + { + "auxiliary_loss_clip": 0.01166387, + "auxiliary_loss_mlp": 0.01107839, + "balance_loss_clip": 1.00221169, + "balance_loss_mlp": 1.00064564, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.6292991151986167, + "language_loss": 0.75796533, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78070754, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 3.96867036819458 + }, + { + "auxiliary_loss_clip": 0.01151038, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00054002, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 4.281555718742499, + "language_loss": 0.69440138, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.71699387, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 2.5675153732299805 + }, + { + "auxiliary_loss_clip": 0.01115532, + "auxiliary_loss_mlp": 0.01108635, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.00077415, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.1365825114594625, + "language_loss": 0.68041635, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70265794, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.6807639598846436 + }, + { + "auxiliary_loss_clip": 0.01134645, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00056851, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.7581648084082637, + "language_loss": 0.66326946, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68569446, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 2.6087214946746826 + }, + { + "auxiliary_loss_clip": 0.01151289, + "auxiliary_loss_mlp": 0.01107696, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00050318, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.584780171438657, + "language_loss": 0.64465028, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66724014, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.5947775840759277 + }, + { + "auxiliary_loss_clip": 0.01149062, + "auxiliary_loss_mlp": 0.01107558, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00055623, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.5257394546909657, + "language_loss": 0.7117517, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.7343179, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.5237228870391846 + }, + { + "auxiliary_loss_clip": 0.01088213, + "auxiliary_loss_mlp": 0.00747825, + "balance_loss_clip": 1.00174165, + "balance_loss_mlp": 1.00095987, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.242454179552308, + "language_loss": 0.77280426, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79116464, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.712705612182617 + }, + { + "auxiliary_loss_clip": 0.01150723, + "auxiliary_loss_mlp": 0.01108381, + "balance_loss_clip": 1.00205827, + "balance_loss_mlp": 1.00052071, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.645717585786371, + "language_loss": 0.67501485, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69760597, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.5677101612091064 + }, + { + "auxiliary_loss_clip": 0.01150921, + "auxiliary_loss_mlp": 0.01109051, + "balance_loss_clip": 1.00209987, + "balance_loss_mlp": 1.00071347, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.6922176012437342, + "language_loss": 0.78651249, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80911219, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.5358076095581055 + }, + { + "auxiliary_loss_clip": 0.01116119, + "auxiliary_loss_mlp": 0.01108952, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00071001, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.1538472769004873, + "language_loss": 0.88907516, + "learning_rate": 1.20277073264638e-06, + "loss": 0.91132587, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 2.6039812564849854 + }, + { + "auxiliary_loss_clip": 0.01151393, + "auxiliary_loss_mlp": 0.01107417, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.0005101, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.4054848552619967, + "language_loss": 0.69130141, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71388954, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.5143134593963623 + }, + { + "auxiliary_loss_clip": 0.01150739, + "auxiliary_loss_mlp": 0.01109093, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00037396, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.8721297525510183, + "language_loss": 0.73823142, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76082969, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 2.5685172080993652 + }, + { + "auxiliary_loss_clip": 0.01117876, + "auxiliary_loss_mlp": 0.01108715, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00066376, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 2.2885143570233226, + "language_loss": 0.69333124, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71559715, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 2.6674580574035645 + }, + { + "auxiliary_loss_clip": 0.01166373, + "auxiliary_loss_mlp": 0.01108945, + "balance_loss_clip": 1.00198138, + "balance_loss_mlp": 1.00041664, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 2.0085732569818453, + "language_loss": 0.66109979, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68385303, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.498845338821411 + }, + { + "auxiliary_loss_clip": 0.01166354, + "auxiliary_loss_mlp": 0.01108138, + "balance_loss_clip": 1.00215483, + "balance_loss_mlp": 1.00056326, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.6989897012011632, + "language_loss": 0.66482168, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68756658, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.507359266281128 + }, + { + "auxiliary_loss_clip": 0.0116638, + "auxiliary_loss_mlp": 0.01108707, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00046551, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 2.1366248153168965, + "language_loss": 0.76264453, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.78539538, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 2.5611672401428223 + }, + { + "auxiliary_loss_clip": 0.01128357, + "auxiliary_loss_mlp": 0.01085987, + "balance_loss_clip": 1.00134659, + "balance_loss_mlp": 1.0001564, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7632022981528356, + "language_loss": 0.60735077, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62949419, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.2340540885925293 + }, + { + "auxiliary_loss_clip": 0.01151421, + "auxiliary_loss_mlp": 0.01107315, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00050318, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.6010110748247246, + "language_loss": 0.67225593, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69484329, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 2.540113687515259 + }, + { + "auxiliary_loss_clip": 0.01151329, + "auxiliary_loss_mlp": 0.01108699, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00045657, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.611681504374042, + "language_loss": 0.73401165, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75661194, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.572323799133301 + }, + { + "auxiliary_loss_clip": 0.01115945, + "auxiliary_loss_mlp": 0.01107467, + "balance_loss_clip": 1.00172269, + "balance_loss_mlp": 1.00055981, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.648221706726715, + "language_loss": 0.68314624, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70538032, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.6617746353149414 + }, + { + "auxiliary_loss_clip": 0.01166104, + "auxiliary_loss_mlp": 0.0110796, + "balance_loss_clip": 1.00189102, + "balance_loss_mlp": 1.00048065, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.5820190580080047, + "language_loss": 0.74798024, + "learning_rate": 1.198843556910427e-06, + "loss": 0.77072084, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.510361909866333 + }, + { + "auxiliary_loss_clip": 0.01086189, + "auxiliary_loss_mlp": 0.01108684, + "balance_loss_clip": 1.00170183, + "balance_loss_mlp": 1.00044203, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.4688470282756485, + "language_loss": 0.7913515, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81330025, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.7253944873809814 + }, + { + "auxiliary_loss_clip": 0.01166297, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00059748, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.721194425505335, + "language_loss": 0.6726104, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69535989, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 3.916189670562744 + }, + { + "auxiliary_loss_clip": 0.01149553, + "auxiliary_loss_mlp": 0.01107933, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00054884, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.0195013679734135, + "language_loss": 0.71556836, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73814321, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.5735185146331787 + }, + { + "auxiliary_loss_clip": 0.01117863, + "auxiliary_loss_mlp": 0.0110758, + "balance_loss_clip": 1.00169182, + "balance_loss_mlp": 1.00057769, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.8368888813361646, + "language_loss": 0.75470454, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77695894, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 2.615558385848999 + }, + { + "auxiliary_loss_clip": 0.01116094, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_clip": 1.00192606, + "balance_loss_mlp": 1.00057435, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.6639352114144033, + "language_loss": 0.68862861, + "learning_rate": 1.197059691144867e-06, + "loss": 0.71088248, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 2.6664671897888184 + }, + { + "auxiliary_loss_clip": 0.01134814, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00050151, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.9575035622685322, + "language_loss": 0.66323406, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68567157, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.64503812789917 + }, + { + "auxiliary_loss_clip": 0.01166222, + "auxiliary_loss_mlp": 0.01108333, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00056827, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 5.318456025626678, + "language_loss": 0.73348546, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75623101, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 2.4827423095703125 + }, + { + "auxiliary_loss_clip": 0.01147554, + "auxiliary_loss_mlp": 0.01107439, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00043702, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.023519477399448, + "language_loss": 0.71617043, + "learning_rate": 1.195989736948226e-06, + "loss": 0.73872042, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.538041353225708 + }, + { + "auxiliary_loss_clip": 0.01134935, + "auxiliary_loss_mlp": 0.01107268, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00045633, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.7543946985472703, + "language_loss": 0.77454102, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.79696304, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 2.553408622741699 + }, + { + "auxiliary_loss_clip": 0.01132369, + "auxiliary_loss_mlp": 0.01108769, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00062251, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.6245012318964493, + "language_loss": 0.74419028, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76660174, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 2.555832624435425 + }, + { + "auxiliary_loss_clip": 0.01149471, + "auxiliary_loss_mlp": 0.01107417, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00070047, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.8429972103289034, + "language_loss": 0.6162113, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63878018, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.565208911895752 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01108182, + "balance_loss_clip": 1.00163388, + "balance_loss_mlp": 1.00051212, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.5379632865328627, + "language_loss": 0.59705806, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61930162, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.705111503601074 + }, + { + "auxiliary_loss_clip": 0.01132723, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00065279, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.3232111451240425, + "language_loss": 0.79969585, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82210445, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 3.9731171131134033 + }, + { + "auxiliary_loss_clip": 0.01166174, + "auxiliary_loss_mlp": 0.01108277, + "balance_loss_clip": 1.00192273, + "balance_loss_mlp": 1.00060701, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.682355289504313, + "language_loss": 0.73632491, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75906944, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.5382745265960693 + }, + { + "auxiliary_loss_clip": 0.01119578, + "auxiliary_loss_mlp": 0.01107303, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 1.00049186, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 1.8774136347119188, + "language_loss": 0.75662208, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77889085, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 4.0516252517700195 + }, + { + "auxiliary_loss_clip": 0.01135086, + "auxiliary_loss_mlp": 0.0110788, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00059116, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.4092903492509503, + "language_loss": 0.66309696, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68552661, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 2.696401357650757 + }, + { + "auxiliary_loss_clip": 0.01161852, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_clip": 1.00128841, + "balance_loss_mlp": 0.99999315, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8550134936672362, + "language_loss": 0.63468432, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65716487, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 3.0736420154571533 + }, + { + "auxiliary_loss_clip": 0.01150438, + "auxiliary_loss_mlp": 0.01106477, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00052369, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.7672246121919317, + "language_loss": 0.69525945, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71782857, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 3.9833076000213623 + }, + { + "auxiliary_loss_clip": 0.01166188, + "auxiliary_loss_mlp": 0.01107694, + "balance_loss_clip": 1.00197017, + "balance_loss_mlp": 1.00050116, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.7460760481468958, + "language_loss": 0.73393929, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.7566781, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.5542049407958984 + }, + { + "auxiliary_loss_clip": 0.01149091, + "auxiliary_loss_mlp": 0.01109001, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00047326, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 10.700804010210227, + "language_loss": 0.81708556, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.83966643, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 2.5004563331604004 + }, + { + "auxiliary_loss_clip": 0.01134411, + "auxiliary_loss_mlp": 0.01107406, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00078511, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 1.851264181056569, + "language_loss": 0.74195576, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.7643739, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 2.5632667541503906 + }, + { + "auxiliary_loss_clip": 0.01083327, + "auxiliary_loss_mlp": 0.0108572, + "balance_loss_clip": 1.00155878, + "balance_loss_mlp": 1.00027072, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.646709912288777, + "language_loss": 0.54605329, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56774378, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 3.3080942630767822 + }, + { + "auxiliary_loss_clip": 0.01116212, + "auxiliary_loss_mlp": 0.01107566, + "balance_loss_clip": 1.00179207, + "balance_loss_mlp": 1.0005641, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.543741820357563, + "language_loss": 0.7671302, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.78936791, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 2.987708568572998 + }, + { + "auxiliary_loss_clip": 0.01118641, + "auxiliary_loss_mlp": 0.01107146, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00071609, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.5737778644374887, + "language_loss": 0.79293001, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81518787, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 2.62079119682312 + }, + { + "auxiliary_loss_clip": 0.01120227, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00047731, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 1.7584115723816283, + "language_loss": 0.79843992, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82071698, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.621138334274292 + }, + { + "auxiliary_loss_clip": 0.01149511, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.00187576, + "balance_loss_mlp": 1.00052118, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.6238570061634834, + "language_loss": 0.8557176, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87828797, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.563737154006958 + }, + { + "auxiliary_loss_clip": 0.01102948, + "auxiliary_loss_mlp": 0.01109166, + "balance_loss_clip": 1.00176013, + "balance_loss_mlp": 1.00063848, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.420550435223085, + "language_loss": 0.65271974, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67484087, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.6384549140930176 + }, + { + "auxiliary_loss_clip": 0.01165978, + "auxiliary_loss_mlp": 0.01108012, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00053334, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 2.0293742198167153, + "language_loss": 0.80742365, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.83016348, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.5143966674804688 + }, + { + "auxiliary_loss_clip": 0.0115133, + "auxiliary_loss_mlp": 0.0110757, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00047266, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.8841550749341094, + "language_loss": 0.66168213, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68427116, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.621522903442383 + }, + { + "auxiliary_loss_clip": 0.01115184, + "auxiliary_loss_mlp": 0.01107326, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00051486, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.5940440671529583, + "language_loss": 0.78460383, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80682886, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.6611928939819336 + }, + { + "auxiliary_loss_clip": 0.0115145, + "auxiliary_loss_mlp": 0.01108565, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00060892, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.6290613937081881, + "language_loss": 0.82758558, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.85018569, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 2.5609078407287598 + }, + { + "auxiliary_loss_clip": 0.01165985, + "auxiliary_loss_mlp": 0.01107379, + "balance_loss_clip": 1.00193942, + "balance_loss_mlp": 1.00056767, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.8012410137540689, + "language_loss": 0.78317487, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80590856, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 2.5250236988067627 + }, + { + "auxiliary_loss_clip": 0.01119043, + "auxiliary_loss_mlp": 0.01107137, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.0005157, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.3580667581080395, + "language_loss": 0.81585324, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83811498, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.654262065887451 + }, + { + "auxiliary_loss_clip": 0.01134836, + "auxiliary_loss_mlp": 0.01107403, + "balance_loss_clip": 1.00173593, + "balance_loss_mlp": 1.00049639, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 2.2523606305260344, + "language_loss": 0.81149077, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83391321, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.604067802429199 + }, + { + "auxiliary_loss_clip": 0.01115579, + "auxiliary_loss_mlp": 0.0110825, + "balance_loss_clip": 1.00163507, + "balance_loss_mlp": 1.00048494, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 2.78402002292665, + "language_loss": 0.78317022, + "learning_rate": 1.186372540666424e-06, + "loss": 0.8054086, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 2.6785888671875 + }, + { + "auxiliary_loss_clip": 0.01166015, + "auxiliary_loss_mlp": 0.01106957, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.00062275, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.5670915947337787, + "language_loss": 0.68324894, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70597863, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 2.5802407264709473 + }, + { + "auxiliary_loss_clip": 0.01145275, + "auxiliary_loss_mlp": 0.01085956, + "balance_loss_clip": 1.00125241, + "balance_loss_mlp": 1.00012541, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.8199068433678153, + "language_loss": 0.49602491, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51833725, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.3302347660064697 + }, + { + "auxiliary_loss_clip": 0.01166259, + "auxiliary_loss_mlp": 0.01108475, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.0006144, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 1.792304423984504, + "language_loss": 0.77719575, + "learning_rate": 1.18530534681967e-06, + "loss": 0.79994315, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.5550942420959473 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.01107297, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00048578, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.6256276837263246, + "language_loss": 0.76716781, + "learning_rate": 1.18494967730604e-06, + "loss": 0.7895776, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.614011764526367 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01107501, + "balance_loss_clip": 1.00189555, + "balance_loss_mlp": 1.00049901, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.3944538517466114, + "language_loss": 0.72826421, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75050485, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 2.664074420928955 + }, + { + "auxiliary_loss_clip": 0.01166151, + "auxiliary_loss_mlp": 0.01107092, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00047147, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.464884943547051, + "language_loss": 0.77925378, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80198622, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 3.9658334255218506 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.01108224, + "balance_loss_clip": 1.0019908, + "balance_loss_mlp": 1.00055361, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.5764514312539786, + "language_loss": 0.58400416, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60660326, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 2.5863475799560547 + }, + { + "auxiliary_loss_clip": 0.0114931, + "auxiliary_loss_mlp": 0.0110649, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00063169, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.6189239294402833, + "language_loss": 0.83677524, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85933328, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.5346872806549072 + }, + { + "auxiliary_loss_clip": 0.01134666, + "auxiliary_loss_mlp": 0.01107562, + "balance_loss_clip": 1.00165474, + "balance_loss_mlp": 1.00055993, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.9838606961550602, + "language_loss": 0.81885219, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.8412745, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 2.6030142307281494 + }, + { + "auxiliary_loss_clip": 0.01151722, + "auxiliary_loss_mlp": 0.01107996, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00051689, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 1.840518271105348, + "language_loss": 0.81594241, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83853954, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 2.5382325649261475 + }, + { + "auxiliary_loss_clip": 0.01149942, + "auxiliary_loss_mlp": 0.01108171, + "balance_loss_clip": 1.00189245, + "balance_loss_mlp": 1.00050163, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.514837059159131, + "language_loss": 0.78759664, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81017774, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 2.52004075050354 + }, + { + "auxiliary_loss_clip": 0.01052563, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_clip": 1.00170994, + "balance_loss_mlp": 1.00038886, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 3.0659874288110096, + "language_loss": 0.74206352, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76366591, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 2.8218603134155273 + }, + { + "auxiliary_loss_clip": 0.01115151, + "auxiliary_loss_mlp": 0.01108379, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00051892, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.5086846740350657, + "language_loss": 0.66281807, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68505335, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 2.634507894515991 + }, + { + "auxiliary_loss_clip": 0.01085181, + "auxiliary_loss_mlp": 0.01107976, + "balance_loss_clip": 1.00168049, + "balance_loss_mlp": 1.00049686, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.4410689063451987, + "language_loss": 0.63584185, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65777349, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 2.7015702724456787 + }, + { + "auxiliary_loss_clip": 0.01165969, + "auxiliary_loss_mlp": 0.01107119, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00049818, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 2.03140198511983, + "language_loss": 0.67716831, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.6998992, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.5254924297332764 + }, + { + "auxiliary_loss_clip": 0.01150268, + "auxiliary_loss_mlp": 0.01107038, + "balance_loss_clip": 1.00185418, + "balance_loss_mlp": 1.00060821, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 1.6041422659896905, + "language_loss": 0.75379699, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77637005, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 4.0437705516815186 + }, + { + "auxiliary_loss_clip": 0.01151511, + "auxiliary_loss_mlp": 0.01108863, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00052595, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 2.3104850396462053, + "language_loss": 0.66865641, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.69126016, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 3.9686954021453857 + }, + { + "auxiliary_loss_clip": 0.01166, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.00061166, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.1788450974910454, + "language_loss": 0.7367301, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75946051, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 2.477875232696533 + }, + { + "auxiliary_loss_clip": 0.01086632, + "auxiliary_loss_mlp": 0.00747624, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00088036, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.9585084284484011, + "language_loss": 0.74870306, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.76704562, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 2.7152812480926514 + }, + { + "auxiliary_loss_clip": 0.01149632, + "auxiliary_loss_mlp": 0.0110852, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00046885, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.7641660051465673, + "language_loss": 0.70455009, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72713161, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 3.9044342041015625 + }, + { + "auxiliary_loss_clip": 0.01147304, + "auxiliary_loss_mlp": 0.01085945, + "balance_loss_clip": 1.00122333, + "balance_loss_mlp": 1.00011468, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7755075233784705, + "language_loss": 0.58462816, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60696065, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.183987855911255 + }, + { + "auxiliary_loss_clip": 0.01117851, + "auxiliary_loss_mlp": 0.01106742, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00040722, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.8684544923062112, + "language_loss": 0.74760926, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76985526, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 2.6931164264678955 + }, + { + "auxiliary_loss_clip": 0.01132347, + "auxiliary_loss_mlp": 0.00747713, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00100899, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.6914843172678362, + "language_loss": 0.71526384, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.73406446, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 2.6044423580169678 + }, + { + "auxiliary_loss_clip": 0.01132649, + "auxiliary_loss_mlp": 0.01085924, + "balance_loss_clip": 1.00127196, + "balance_loss_mlp": 1.00009346, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6693970583372123, + "language_loss": 0.55339289, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57557869, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 3.1599884033203125 + }, + { + "auxiliary_loss_clip": 0.01166117, + "auxiliary_loss_mlp": 0.0110698, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00055003, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 5.620631302200418, + "language_loss": 0.80446541, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82719642, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 2.512410879135132 + }, + { + "auxiliary_loss_clip": 0.01133934, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_clip": 1.0018121, + "balance_loss_mlp": 1.00044012, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.5131242678362575, + "language_loss": 0.81123567, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83364272, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 2.617264986038208 + }, + { + "auxiliary_loss_clip": 0.01134073, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00055337, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.831651716023289, + "language_loss": 0.71572053, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.73813105, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.5604183673858643 + }, + { + "auxiliary_loss_clip": 0.01166085, + "auxiliary_loss_mlp": 0.01106948, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00042224, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.8988519277319935, + "language_loss": 0.66687542, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68960577, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.702028274536133 + }, + { + "auxiliary_loss_clip": 0.01151356, + "auxiliary_loss_mlp": 0.01108069, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00058937, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.2619474197098977, + "language_loss": 0.73494071, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75753498, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.508923053741455 + }, + { + "auxiliary_loss_clip": 0.01149463, + "auxiliary_loss_mlp": 0.01107374, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00056219, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.337253426045769, + "language_loss": 0.66742498, + "learning_rate": 1.175713157660413e-06, + "loss": 0.68999326, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.5850024223327637 + }, + { + "auxiliary_loss_clip": 0.01131845, + "auxiliary_loss_mlp": 0.01107345, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00081968, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 2.1102389206918524, + "language_loss": 0.67269087, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69508278, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.56082820892334 + }, + { + "auxiliary_loss_clip": 0.01166221, + "auxiliary_loss_mlp": 0.01107992, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00070381, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.6727739569627789, + "language_loss": 0.75996411, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78270626, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.4970643520355225 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01107899, + "balance_loss_clip": 1.00172138, + "balance_loss_mlp": 1.00061107, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.5755131777971187, + "language_loss": 0.76943654, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79171348, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.659299612045288 + }, + { + "auxiliary_loss_clip": 0.01134254, + "auxiliary_loss_mlp": 0.01107107, + "balance_loss_clip": 1.00185287, + "balance_loss_mlp": 1.00048578, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.220595407165012, + "language_loss": 0.68267643, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70509005, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 2.5748322010040283 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01108221, + "balance_loss_clip": 1.00196159, + "balance_loss_mlp": 1.00055122, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.9816726283072663, + "language_loss": 0.7095052, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73190707, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.578312873840332 + }, + { + "auxiliary_loss_clip": 0.01120018, + "auxiliary_loss_mlp": 0.01108701, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00064993, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.7171640969961286, + "language_loss": 0.77948666, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80177391, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 2.5789670944213867 + }, + { + "auxiliary_loss_clip": 0.01166178, + "auxiliary_loss_mlp": 0.01107516, + "balance_loss_clip": 1.00202608, + "balance_loss_mlp": 1.00070429, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.6149207158094525, + "language_loss": 0.84966117, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87239814, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.512394666671753 + }, + { + "auxiliary_loss_clip": 0.01134631, + "auxiliary_loss_mlp": 0.01107817, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00071955, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.0625896764348597, + "language_loss": 0.59522915, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61765361, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.5514540672302246 + }, + { + "auxiliary_loss_clip": 0.01116527, + "auxiliary_loss_mlp": 0.01107369, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00065279, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.2164735734555325, + "language_loss": 0.68040717, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70264614, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.601026773452759 + }, + { + "auxiliary_loss_clip": 0.01104172, + "auxiliary_loss_mlp": 0.01107932, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.00045252, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.322969176800039, + "language_loss": 0.73895419, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76107526, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 2.7098641395568848 + }, + { + "auxiliary_loss_clip": 0.01102381, + "auxiliary_loss_mlp": 0.01106831, + "balance_loss_clip": 1.00170791, + "balance_loss_mlp": 1.00049591, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.4104076804086032, + "language_loss": 0.74343348, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76552558, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.6919102668762207 + }, + { + "auxiliary_loss_clip": 0.01115897, + "auxiliary_loss_mlp": 0.01108595, + "balance_loss_clip": 1.00175309, + "balance_loss_mlp": 1.00054359, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.534010259842256, + "language_loss": 0.67683244, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.69907737, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.669423818588257 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01109043, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00060987, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.512435739840456, + "language_loss": 0.7547226, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77701408, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.6777560710906982 + }, + { + "auxiliary_loss_clip": 0.01135795, + "auxiliary_loss_mlp": 0.01107569, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00047147, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.4356495775781921, + "language_loss": 0.65041673, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67285037, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 4.257268190383911 + }, + { + "auxiliary_loss_clip": 0.0110155, + "auxiliary_loss_mlp": 0.01108662, + "balance_loss_clip": 1.00174022, + "balance_loss_mlp": 1.00051534, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 1.8848851014164982, + "language_loss": 0.69272077, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71482289, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 2.688713550567627 + }, + { + "auxiliary_loss_clip": 0.01166302, + "auxiliary_loss_mlp": 0.01108274, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00050843, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.058555368186997, + "language_loss": 0.82676977, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.84951556, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.5020058155059814 + }, + { + "auxiliary_loss_clip": 0.01161884, + "auxiliary_loss_mlp": 0.01085967, + "balance_loss_clip": 1.00120711, + "balance_loss_mlp": 1.00013661, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7104663510795599, + "language_loss": 0.57752728, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.60000587, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 3.2716667652130127 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01107246, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00052977, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 2.8035418968801173, + "language_loss": 0.6079421, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.63019472, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.75453519821167 + }, + { + "auxiliary_loss_clip": 0.0116598, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_clip": 1.00196528, + "balance_loss_mlp": 1.00050378, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.8467254335052783, + "language_loss": 0.63105071, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65378177, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 2.59549880027771 + }, + { + "auxiliary_loss_clip": 0.01134827, + "auxiliary_loss_mlp": 0.01108275, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00060523, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.5211150061532692, + "language_loss": 0.75402558, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77645659, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 2.6285932064056396 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.00200891, + "balance_loss_mlp": 1.00056899, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 2.099714729138149, + "language_loss": 0.77697313, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79953635, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 2.530202627182007 + }, + { + "auxiliary_loss_clip": 0.01101545, + "auxiliary_loss_mlp": 0.0110737, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00046289, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.6684916641344, + "language_loss": 0.71735245, + "learning_rate": 1.167914135250663e-06, + "loss": 0.73944157, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 2.6919586658477783 + }, + { + "auxiliary_loss_clip": 0.0116611, + "auxiliary_loss_mlp": 0.01106851, + "balance_loss_clip": 1.00199354, + "balance_loss_mlp": 1.00061178, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9549955450532819, + "language_loss": 0.72239327, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74512291, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 2.513979911804199 + }, + { + "auxiliary_loss_clip": 0.01104308, + "auxiliary_loss_mlp": 0.01108218, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.000453, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.7228926134481364, + "language_loss": 0.73267823, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75480354, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 4.171593904495239 + }, + { + "auxiliary_loss_clip": 0.01120044, + "auxiliary_loss_mlp": 0.01108125, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.00064588, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.7215140635576087, + "language_loss": 0.73606521, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.75834692, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.593505859375 + }, + { + "auxiliary_loss_clip": 0.01134298, + "auxiliary_loss_mlp": 0.01105743, + "balance_loss_clip": 1.00167251, + "balance_loss_mlp": 1.00055325, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.4914076411281156, + "language_loss": 0.82901013, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85141057, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 4.120572566986084 + }, + { + "auxiliary_loss_clip": 0.01148791, + "auxiliary_loss_mlp": 0.00747588, + "balance_loss_clip": 1.00178635, + "balance_loss_mlp": 1.00097871, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.4919316121434598, + "language_loss": 0.78210461, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80106843, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 2.5506958961486816 + }, + { + "auxiliary_loss_clip": 0.01149396, + "auxiliary_loss_mlp": 0.0110771, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00070739, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.0185482954578617, + "language_loss": 0.68795395, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71052504, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 4.019811391830444 + }, + { + "auxiliary_loss_clip": 0.0111952, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00060964, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.7599176337721731, + "language_loss": 0.65444463, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67671692, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.653294324874878 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01107688, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00059032, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.4791973240468925, + "language_loss": 0.7920301, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81445491, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 2.5743279457092285 + }, + { + "auxiliary_loss_clip": 0.01149429, + "auxiliary_loss_mlp": 0.01107345, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00062943, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 1.8093503280747252, + "language_loss": 0.73721409, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75978184, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 2.5729153156280518 + }, + { + "auxiliary_loss_clip": 0.01149284, + "auxiliary_loss_mlp": 0.01107024, + "balance_loss_clip": 1.00180769, + "balance_loss_mlp": 1.00049829, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4078037033957949, + "language_loss": 0.78123105, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80379415, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.5909481048583984 + }, + { + "auxiliary_loss_clip": 0.01144766, + "auxiliary_loss_mlp": 0.01086067, + "balance_loss_clip": 1.0012908, + "balance_loss_mlp": 1.00023675, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7185724738448327, + "language_loss": 0.59428, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61658829, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 3.1143579483032227 + }, + { + "auxiliary_loss_clip": 0.01039208, + "auxiliary_loss_mlp": 0.01106666, + "balance_loss_clip": 1.00154424, + "balance_loss_mlp": 1.00052166, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 2.004670984853345, + "language_loss": 0.79252124, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81397998, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 3.0260517597198486 + }, + { + "auxiliary_loss_clip": 0.01166169, + "auxiliary_loss_mlp": 0.01107812, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00052357, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 2.2404815678657624, + "language_loss": 0.78603184, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.80877161, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 2.7106144428253174 + }, + { + "auxiliary_loss_clip": 0.01149419, + "auxiliary_loss_mlp": 0.00747643, + "balance_loss_clip": 1.001845, + "balance_loss_mlp": 1.00095439, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 1.934387103087602, + "language_loss": 0.64333379, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66230446, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.594705581665039 + }, + { + "auxiliary_loss_clip": 0.01166292, + "auxiliary_loss_mlp": 0.01107941, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00065303, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 1.7291266949128545, + "language_loss": 0.88762826, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.91037059, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.533487558364868 + }, + { + "auxiliary_loss_clip": 0.01134512, + "auxiliary_loss_mlp": 0.01106725, + "balance_loss_clip": 1.0018307, + "balance_loss_mlp": 1.0005815, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.066415023713077, + "language_loss": 0.72910321, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75151563, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.5712339878082275 + }, + { + "auxiliary_loss_clip": 0.01117714, + "auxiliary_loss_mlp": 0.01106031, + "balance_loss_clip": 1.00171006, + "balance_loss_mlp": 1.00045919, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.5171602823871784, + "language_loss": 0.69317561, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71541309, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.756517171859741 + }, + { + "auxiliary_loss_clip": 0.01118226, + "auxiliary_loss_mlp": 0.01107626, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.0006237, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.9624587882760631, + "language_loss": 0.7132917, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73555028, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.691521644592285 + }, + { + "auxiliary_loss_clip": 0.01166169, + "auxiliary_loss_mlp": 0.01108034, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.0005548, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 1.8346063256728755, + "language_loss": 0.84282386, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.8655659, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 2.504525661468506 + }, + { + "auxiliary_loss_clip": 0.01118653, + "auxiliary_loss_mlp": 0.01107283, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00066197, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 2.044939121775288, + "language_loss": 0.77578872, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79804814, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.603651523590088 + }, + { + "auxiliary_loss_clip": 0.01151284, + "auxiliary_loss_mlp": 0.01106785, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00054598, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.638070900222915, + "language_loss": 0.75609863, + "learning_rate": 1.160483857897479e-06, + "loss": 0.77867937, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.7008018493652344 + }, + { + "auxiliary_loss_clip": 0.01166218, + "auxiliary_loss_mlp": 0.01107611, + "balance_loss_clip": 1.00200236, + "balance_loss_mlp": 1.00060916, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 1.9912626883937803, + "language_loss": 0.59966964, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62240797, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.496001720428467 + }, + { + "auxiliary_loss_clip": 0.01115856, + "auxiliary_loss_mlp": 0.01107633, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.00063062, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.598429724486088, + "language_loss": 0.85639191, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.87862676, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.65197491645813 + }, + { + "auxiliary_loss_clip": 0.01134797, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00061727, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 1.9657633014946514, + "language_loss": 0.78026587, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80269575, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 2.6078920364379883 + }, + { + "auxiliary_loss_clip": 0.0111748, + "auxiliary_loss_mlp": 0.01107416, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00050879, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 2.095865901462109, + "language_loss": 0.7428627, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76511168, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.650200366973877 + }, + { + "auxiliary_loss_clip": 0.01149299, + "auxiliary_loss_mlp": 0.0074778, + "balance_loss_clip": 1.00174809, + "balance_loss_mlp": 1.00106013, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.9879774684147424, + "language_loss": 0.69648558, + "learning_rate": 1.158716808837621e-06, + "loss": 0.71545637, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.608147621154785 + }, + { + "auxiliary_loss_clip": 0.0113493, + "auxiliary_loss_mlp": 0.0110844, + "balance_loss_clip": 1.00182927, + "balance_loss_mlp": 1.00057971, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 2.5024794476578913, + "language_loss": 0.53858936, + "learning_rate": 1.158363494676679e-06, + "loss": 0.561023, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.6310653686523438 + }, + { + "auxiliary_loss_clip": 0.01149483, + "auxiliary_loss_mlp": 0.01107836, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00054717, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.4404548919056772, + "language_loss": 0.77896893, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.80154216, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 2.6053202152252197 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_clip": 1.00175631, + "balance_loss_mlp": 1.00055194, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 1.9148807716877336, + "language_loss": 0.70754635, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.7296195, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.6747398376464844 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00061142, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.6831361068720767, + "language_loss": 0.76895273, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79101235, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 4.084362030029297 + }, + { + "auxiliary_loss_clip": 0.01149764, + "auxiliary_loss_mlp": 0.01108141, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00066161, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.7940698791014735, + "language_loss": 0.71482098, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.73739994, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 2.634178400039673 + }, + { + "auxiliary_loss_clip": 0.01147173, + "auxiliary_loss_mlp": 0.01085846, + "balance_loss_clip": 1.00119054, + "balance_loss_mlp": 1.00001574, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7734122329300408, + "language_loss": 0.6027931, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62512338, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 3.2505857944488525 + }, + { + "auxiliary_loss_clip": 0.01151522, + "auxiliary_loss_mlp": 0.01108167, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00068748, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.6353336669640883, + "language_loss": 0.78624117, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80883801, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 2.5829906463623047 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01107811, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00071359, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.8660546160926876, + "language_loss": 0.74572527, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76846361, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.529174566268921 + }, + { + "auxiliary_loss_clip": 0.01086251, + "auxiliary_loss_mlp": 0.01107344, + "balance_loss_clip": 1.00167799, + "balance_loss_mlp": 1.00053287, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 1.9084544532846366, + "language_loss": 0.69803846, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.7199744, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 2.7107226848602295 + }, + { + "auxiliary_loss_clip": 0.01149419, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_clip": 1.00200975, + "balance_loss_mlp": 1.0005486, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.7224588290060119, + "language_loss": 0.72702706, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74959582, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 2.568942070007324 + }, + { + "auxiliary_loss_clip": 0.01132815, + "auxiliary_loss_mlp": 0.01108019, + "balance_loss_clip": 1.001724, + "balance_loss_mlp": 1.00044489, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 7.011420564151918, + "language_loss": 0.64964271, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.67205101, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 2.6508066654205322 + }, + { + "auxiliary_loss_clip": 0.01132814, + "auxiliary_loss_mlp": 0.00747852, + "balance_loss_clip": 1.00170159, + "balance_loss_mlp": 1.00114369, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.078490053246721, + "language_loss": 0.78510195, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.80390859, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 2.5883917808532715 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01085935, + "balance_loss_clip": 1.00121593, + "balance_loss_mlp": 1.00010419, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7981225202484868, + "language_loss": 0.58919102, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.61135703, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 3.3346290588378906 + }, + { + "auxiliary_loss_clip": 0.01134675, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_clip": 1.00204396, + "balance_loss_mlp": 1.00044131, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.8113207239450866, + "language_loss": 0.63495725, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.6573813, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 4.1014015674591064 + }, + { + "auxiliary_loss_clip": 0.01149413, + "auxiliary_loss_mlp": 0.007476, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.00088882, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.5991739644509975, + "language_loss": 0.81295943, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83192962, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 4.426043748855591 + }, + { + "auxiliary_loss_clip": 0.01114302, + "auxiliary_loss_mlp": 0.0110698, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.0006454, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.5958601172460896, + "language_loss": 0.71390539, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.7361182, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 2.7080390453338623 + }, + { + "auxiliary_loss_clip": 0.01084883, + "auxiliary_loss_mlp": 0.0110752, + "balance_loss_clip": 1.00184953, + "balance_loss_mlp": 1.00051761, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.5047859697903552, + "language_loss": 0.77572173, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79764575, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 4.216289043426514 + }, + { + "auxiliary_loss_clip": 0.0114971, + "auxiliary_loss_mlp": 0.01108827, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00049007, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.8056903651328782, + "language_loss": 0.8540132, + "learning_rate": 1.152362047854413e-06, + "loss": 0.8765986, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 2.568019390106201 + }, + { + "auxiliary_loss_clip": 0.0112009, + "auxiliary_loss_mlp": 0.01107846, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00065351, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.7172548706857351, + "language_loss": 0.79752994, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.81980932, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.6074576377868652 + }, + { + "auxiliary_loss_clip": 0.01103195, + "auxiliary_loss_mlp": 0.00747882, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.0010891, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.6140632389224892, + "language_loss": 0.65343922, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67194998, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 2.8644745349884033 + }, + { + "auxiliary_loss_clip": 0.01166404, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00046158, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.99806680508441, + "language_loss": 0.75307608, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77582902, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 2.4828009605407715 + }, + { + "auxiliary_loss_clip": 0.01134854, + "auxiliary_loss_mlp": 0.01107577, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00047946, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.6747465171379587, + "language_loss": 0.7304157, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75284004, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 2.6004302501678467 + }, + { + "auxiliary_loss_clip": 0.01120243, + "auxiliary_loss_mlp": 0.01108084, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00060463, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.4449197368439841, + "language_loss": 0.72092533, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74320853, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 3.0441083908081055 + }, + { + "auxiliary_loss_clip": 0.01119028, + "auxiliary_loss_mlp": 0.01107505, + "balance_loss_clip": 1.00180411, + "balance_loss_mlp": 1.00050306, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 3.0432202820670224, + "language_loss": 0.65053177, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67279714, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.6564295291900635 + }, + { + "auxiliary_loss_clip": 0.01119851, + "auxiliary_loss_mlp": 0.01107067, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00054133, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 2.040492101688808, + "language_loss": 0.83886278, + "learning_rate": 1.14989356009286e-06, + "loss": 0.86113191, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 2.6648175716400146 + }, + { + "auxiliary_loss_clip": 0.01151431, + "auxiliary_loss_mlp": 0.01108082, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00050783, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.425351219892114, + "language_loss": 0.7774322, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80002725, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 2.568445920944214 + }, + { + "auxiliary_loss_clip": 0.01116969, + "auxiliary_loss_mlp": 0.01106534, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00048494, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.5256246535386573, + "language_loss": 0.80077088, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82300591, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 2.633514404296875 + }, + { + "auxiliary_loss_clip": 0.01118097, + "auxiliary_loss_mlp": 0.0110651, + "balance_loss_clip": 1.00171101, + "balance_loss_mlp": 1.00046194, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.98891569485401, + "language_loss": 0.87570834, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89795446, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.610032796859741 + }, + { + "auxiliary_loss_clip": 0.01166182, + "auxiliary_loss_mlp": 0.01107379, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00056791, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.778975990736153, + "language_loss": 0.66323119, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68596685, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.542875051498413 + }, + { + "auxiliary_loss_clip": 0.01133044, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_clip": 1.00171304, + "balance_loss_mlp": 1.00050139, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 2.4384264219125114, + "language_loss": 0.88219172, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.90459144, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.5656962394714355 + }, + { + "auxiliary_loss_clip": 0.01133645, + "auxiliary_loss_mlp": 0.01107916, + "balance_loss_clip": 1.00170302, + "balance_loss_mlp": 1.0004369, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.029730687022801, + "language_loss": 0.73705709, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75947273, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.589402437210083 + }, + { + "auxiliary_loss_clip": 0.01150866, + "auxiliary_loss_mlp": 0.01107397, + "balance_loss_clip": 1.00189912, + "balance_loss_mlp": 1.00058532, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.9001871932278913, + "language_loss": 0.68840528, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71098787, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.5224671363830566 + }, + { + "auxiliary_loss_clip": 0.01134865, + "auxiliary_loss_mlp": 0.01107543, + "balance_loss_clip": 1.00183737, + "balance_loss_mlp": 1.00044513, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 5.1164811866342665, + "language_loss": 0.76640552, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.78882957, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.6246895790100098 + }, + { + "auxiliary_loss_clip": 0.01149337, + "auxiliary_loss_mlp": 0.01108116, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00044608, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 2.00933458177456, + "language_loss": 0.88930428, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91187876, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.556286334991455 + }, + { + "auxiliary_loss_clip": 0.01161892, + "auxiliary_loss_mlp": 0.01085911, + "balance_loss_clip": 1.0011878, + "balance_loss_mlp": 1.00008035, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 2.8035206853141843, + "language_loss": 0.55393994, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57641798, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.246392011642456 + }, + { + "auxiliary_loss_clip": 0.01118139, + "auxiliary_loss_mlp": 0.01108956, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00052309, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.8791250333347351, + "language_loss": 0.7470963, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76936722, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.653921604156494 + }, + { + "auxiliary_loss_clip": 0.01131154, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_clip": 1.00113511, + "balance_loss_mlp": 1.00011849, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6414430848188228, + "language_loss": 0.51014, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53231484, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 3.266279935836792 + }, + { + "auxiliary_loss_clip": 0.01133886, + "auxiliary_loss_mlp": 0.01108452, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00040054, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.0090953707321524, + "language_loss": 0.83504915, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85747254, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 2.6070556640625 + }, + { + "auxiliary_loss_clip": 0.01133119, + "auxiliary_loss_mlp": 0.01108176, + "balance_loss_clip": 1.00198305, + "balance_loss_mlp": 1.00060105, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0279959046791824, + "language_loss": 0.83303595, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85544884, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.569554328918457 + }, + { + "auxiliary_loss_clip": 0.01151452, + "auxiliary_loss_mlp": 0.01108413, + "balance_loss_clip": 1.00200474, + "balance_loss_mlp": 1.00064802, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.6631062057513664, + "language_loss": 0.77290857, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79550719, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 4.056194305419922 + }, + { + "auxiliary_loss_clip": 0.01134824, + "auxiliary_loss_mlp": 0.01107761, + "balance_loss_clip": 1.0017308, + "balance_loss_mlp": 1.00066304, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.5451998722400948, + "language_loss": 0.77341688, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79584277, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.600264549255371 + }, + { + "auxiliary_loss_clip": 0.01118333, + "auxiliary_loss_mlp": 0.01107952, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00066352, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 2.089612500181093, + "language_loss": 0.82120687, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84346974, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 2.612835645675659 + }, + { + "auxiliary_loss_clip": 0.01116084, + "auxiliary_loss_mlp": 0.01107948, + "balance_loss_clip": 1.00161743, + "balance_loss_mlp": 1.00046921, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.9109901640869331, + "language_loss": 0.5883317, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61057198, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 2.8505258560180664 + }, + { + "auxiliary_loss_clip": 0.01161887, + "auxiliary_loss_mlp": 0.01085851, + "balance_loss_clip": 1.00115442, + "balance_loss_mlp": 1.00002027, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7268206449271417, + "language_loss": 0.60897768, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63145506, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.1443631649017334 + }, + { + "auxiliary_loss_clip": 0.01133023, + "auxiliary_loss_mlp": 0.01107156, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00053549, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.5280840569884064, + "language_loss": 0.67789888, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70030063, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.7315821647644043 + }, + { + "auxiliary_loss_clip": 0.01100864, + "auxiliary_loss_mlp": 0.01107233, + "balance_loss_clip": 1.00160027, + "balance_loss_mlp": 1.00042164, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.0839432336832533, + "language_loss": 0.73489606, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75697702, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 2.697970390319824 + }, + { + "auxiliary_loss_clip": 0.01166207, + "auxiliary_loss_mlp": 0.01108286, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.0008074, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.4217830162336427, + "language_loss": 0.62924224, + "learning_rate": 1.142145760331648e-06, + "loss": 0.6519872, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.566707134246826 + }, + { + "auxiliary_loss_clip": 0.01146594, + "auxiliary_loss_mlp": 0.01085924, + "balance_loss_clip": 1.00117242, + "balance_loss_mlp": 1.00009334, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8249649597424498, + "language_loss": 0.5610894, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58341455, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 2.9475138187408447 + }, + { + "auxiliary_loss_clip": 0.01149476, + "auxiliary_loss_mlp": 0.01108353, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00058818, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.5996132568644938, + "language_loss": 0.82771707, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.85029531, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 2.5416347980499268 + }, + { + "auxiliary_loss_clip": 0.01149713, + "auxiliary_loss_mlp": 0.01107545, + "balance_loss_clip": 1.00177693, + "balance_loss_mlp": 1.00063801, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 1.7769019487971578, + "language_loss": 0.60094213, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62351477, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.5933187007904053 + }, + { + "auxiliary_loss_clip": 0.01150581, + "auxiliary_loss_mlp": 0.01107471, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00056434, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 1.7427728810548873, + "language_loss": 0.79573131, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81831181, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 4.124828338623047 + }, + { + "auxiliary_loss_clip": 0.01147293, + "auxiliary_loss_mlp": 0.01085924, + "balance_loss_clip": 1.00115609, + "balance_loss_mlp": 1.00009286, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7063503062559663, + "language_loss": 0.60216498, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62449718, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 3.213406801223755 + }, + { + "auxiliary_loss_clip": 0.01166348, + "auxiliary_loss_mlp": 0.0110875, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.00069869, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.782896876621684, + "language_loss": 0.80905765, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83180863, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 4.034061670303345 + }, + { + "auxiliary_loss_clip": 0.01131298, + "auxiliary_loss_mlp": 0.01107216, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.0005002, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.1370034981583017, + "language_loss": 0.74579483, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76818001, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 2.612689733505249 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.01106961, + "balance_loss_clip": 1.00177801, + "balance_loss_mlp": 1.0005312, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.3852426458719092, + "language_loss": 0.68142647, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70352602, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 4.1007959842681885 + }, + { + "auxiliary_loss_clip": 0.01132603, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.00178051, + "balance_loss_mlp": 1.00088871, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.6500221203199235, + "language_loss": 0.66551566, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68431824, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.621600866317749 + }, + { + "auxiliary_loss_clip": 0.01132955, + "auxiliary_loss_mlp": 0.01108139, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00056493, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.1735112271961827, + "language_loss": 0.73442298, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75683391, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.01132888, + "auxiliary_loss_mlp": 0.01108391, + "balance_loss_clip": 1.0016588, + "balance_loss_mlp": 1.00043535, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 2.212945576839672, + "language_loss": 0.66564214, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68805492, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.5871188640594482 + }, + { + "auxiliary_loss_clip": 0.01117084, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_clip": 1.00167048, + "balance_loss_mlp": 1.00022018, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7262223133221737, + "language_loss": 0.63056338, + "learning_rate": 1.137926314758634e-06, + "loss": 0.6525948, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 3.3543879985809326 + }, + { + "auxiliary_loss_clip": 0.01151179, + "auxiliary_loss_mlp": 0.01108824, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00058162, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.6313388358534449, + "language_loss": 0.77437198, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79697204, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.7165679931640625 + }, + { + "auxiliary_loss_clip": 0.01119882, + "auxiliary_loss_mlp": 0.0110765, + "balance_loss_clip": 1.00178528, + "balance_loss_mlp": 1.00045753, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.712377629165491, + "language_loss": 0.78814703, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.8104223, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 2.7176811695098877 + }, + { + "auxiliary_loss_clip": 0.01166232, + "auxiliary_loss_mlp": 0.01108523, + "balance_loss_clip": 1.00196123, + "balance_loss_mlp": 1.00047183, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.5767267263405307, + "language_loss": 0.73803544, + "learning_rate": 1.136872187988815e-06, + "loss": 0.76078296, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 2.5935328006744385 + }, + { + "auxiliary_loss_clip": 0.01136477, + "auxiliary_loss_mlp": 0.01107461, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00064957, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.1140895059244493, + "language_loss": 0.62564576, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.64808512, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.563899517059326 + }, + { + "auxiliary_loss_clip": 0.01166023, + "auxiliary_loss_mlp": 0.01107762, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00056911, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.6708531382297376, + "language_loss": 0.78492677, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80766457, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.495041847229004 + }, + { + "auxiliary_loss_clip": 0.01149333, + "auxiliary_loss_mlp": 0.01107974, + "balance_loss_clip": 1.00180542, + "balance_loss_mlp": 1.00039959, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.4638106966276287, + "language_loss": 0.67886066, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70143378, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.5423600673675537 + }, + { + "auxiliary_loss_clip": 0.01149561, + "auxiliary_loss_mlp": 0.0110829, + "balance_loss_clip": 1.00201738, + "balance_loss_mlp": 1.00052428, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 1.8260178690399036, + "language_loss": 0.66537368, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68795222, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 2.5222861766815186 + }, + { + "auxiliary_loss_clip": 0.0113267, + "auxiliary_loss_mlp": 0.0110766, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00056255, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 2.0831781959285216, + "language_loss": 0.64815652, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67055988, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.6931374073028564 + }, + { + "auxiliary_loss_clip": 0.01134603, + "auxiliary_loss_mlp": 0.01107185, + "balance_loss_clip": 1.00180078, + "balance_loss_mlp": 1.0006597, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.6972104772493912, + "language_loss": 0.77249014, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79490805, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 2.599217653274536 + }, + { + "auxiliary_loss_clip": 0.01131597, + "auxiliary_loss_mlp": 0.01107022, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00049639, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.7056890970271659, + "language_loss": 0.74903351, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.77141964, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.5856425762176514 + }, + { + "auxiliary_loss_clip": 0.01149322, + "auxiliary_loss_mlp": 0.01107082, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00046086, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.843692985291632, + "language_loss": 0.85952628, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88209039, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.605855703353882 + }, + { + "auxiliary_loss_clip": 0.01121547, + "auxiliary_loss_mlp": 0.00747671, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00096238, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.6445735352452795, + "language_loss": 0.81456268, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83325487, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.6325788497924805 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01107519, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00051713, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.3875159739006677, + "language_loss": 0.82082009, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84323454, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.6137187480926514 + }, + { + "auxiliary_loss_clip": 0.01131252, + "auxiliary_loss_mlp": 0.01107658, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00036967, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 2.3211739973982377, + "language_loss": 0.80604315, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.82843226, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 2.579697370529175 + }, + { + "auxiliary_loss_clip": 0.01117929, + "auxiliary_loss_mlp": 0.01108174, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00050437, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 3.365131047011768, + "language_loss": 0.79981744, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.82207847, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.594376802444458 + }, + { + "auxiliary_loss_clip": 0.01149511, + "auxiliary_loss_mlp": 0.01107735, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00063753, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 1.7742796502786333, + "language_loss": 0.72180486, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74437737, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.5688533782958984 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01108221, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00064671, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 7.049096740219989, + "language_loss": 0.74548471, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.76774776, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.6472668647766113 + }, + { + "auxiliary_loss_clip": 0.01147725, + "auxiliary_loss_mlp": 0.00747529, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00087094, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.558220513350196, + "language_loss": 0.55627537, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57522792, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 2.5646286010742188 + }, + { + "auxiliary_loss_clip": 0.01134222, + "auxiliary_loss_mlp": 0.01107638, + "balance_loss_clip": 1.00201929, + "balance_loss_mlp": 1.0006361, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.505526518479437, + "language_loss": 0.74807227, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77049088, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 2.601045608520508 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01107074, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00054812, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.9991823867296268, + "language_loss": 0.75535476, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77791959, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 3.9474987983703613 + }, + { + "auxiliary_loss_clip": 0.01118461, + "auxiliary_loss_mlp": 0.01107974, + "balance_loss_clip": 1.00186467, + "balance_loss_mlp": 1.00078058, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.4609244110643893, + "language_loss": 0.81539166, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83765602, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 2.662942409515381 + }, + { + "auxiliary_loss_clip": 0.01166091, + "auxiliary_loss_mlp": 0.01107912, + "balance_loss_clip": 1.00187147, + "balance_loss_mlp": 1.00062346, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.5989703859139666, + "language_loss": 0.69927812, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72201812, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 2.5714497566223145 + }, + { + "auxiliary_loss_clip": 0.01050083, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_clip": 1.00170875, + "balance_loss_mlp": 1.00070786, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 5.130086353935247, + "language_loss": 0.7931245, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81469864, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 2.759756326675415 + }, + { + "auxiliary_loss_clip": 0.01134424, + "auxiliary_loss_mlp": 0.00747602, + "balance_loss_clip": 1.00174224, + "balance_loss_mlp": 1.00086844, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 3.6278871515125104, + "language_loss": 0.7951684, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81398869, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 2.624133348464966 + }, + { + "auxiliary_loss_clip": 0.01135595, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_clip": 1.00172019, + "balance_loss_mlp": 1.0004406, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 1.9329446210495775, + "language_loss": 0.84178722, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86421096, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.72139573097229 + }, + { + "auxiliary_loss_clip": 0.01131999, + "auxiliary_loss_mlp": 0.01108011, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 1.00043702, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.8950385347548924, + "language_loss": 0.71631956, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73871958, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.5467629432678223 + }, + { + "auxiliary_loss_clip": 0.01118, + "auxiliary_loss_mlp": 0.01106355, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00059247, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.7598828194087688, + "language_loss": 0.83971101, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86195457, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 2.610948085784912 + }, + { + "auxiliary_loss_clip": 0.01118381, + "auxiliary_loss_mlp": 0.01108016, + "balance_loss_clip": 1.00171733, + "balance_loss_mlp": 1.00063229, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 2.325914758946353, + "language_loss": 0.77702236, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79928637, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.6025197505950928 + }, + { + "auxiliary_loss_clip": 0.01166116, + "auxiliary_loss_mlp": 0.01108377, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00042105, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.6115005324895333, + "language_loss": 0.81827486, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84101981, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.4956421852111816 + }, + { + "auxiliary_loss_clip": 0.01100035, + "auxiliary_loss_mlp": 0.01108509, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00064814, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.490143727679683, + "language_loss": 0.85263222, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87471771, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 4.114210605621338 + }, + { + "auxiliary_loss_clip": 0.01134861, + "auxiliary_loss_mlp": 0.01108755, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00060844, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.3736453429455198, + "language_loss": 0.80374622, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82618237, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 3.9683961868286133 + }, + { + "auxiliary_loss_clip": 0.01117334, + "auxiliary_loss_mlp": 0.01106704, + "balance_loss_clip": 1.0017767, + "balance_loss_mlp": 1.00036931, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.69420231697258, + "language_loss": 0.71401942, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.73625976, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.629120349884033 + }, + { + "auxiliary_loss_clip": 0.01149591, + "auxiliary_loss_mlp": 0.0110733, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00042295, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.828097930469612, + "language_loss": 0.78039223, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80296141, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 2.542198419570923 + }, + { + "auxiliary_loss_clip": 0.01134774, + "auxiliary_loss_mlp": 0.01106619, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00057006, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 2.0715671676710463, + "language_loss": 0.79073757, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.8131516, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 3.980900764465332 + }, + { + "auxiliary_loss_clip": 0.01149562, + "auxiliary_loss_mlp": 0.01106465, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00041628, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5724590783167343, + "language_loss": 0.66885495, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.69141519, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.690896511077881 + }, + { + "auxiliary_loss_clip": 0.01134401, + "auxiliary_loss_mlp": 0.0110795, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00047112, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.3588277091301657, + "language_loss": 0.79782832, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82025182, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.596308708190918 + }, + { + "auxiliary_loss_clip": 0.01149308, + "auxiliary_loss_mlp": 0.007477, + "balance_loss_clip": 1.00176299, + "balance_loss_mlp": 1.00098693, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.7564526574318244, + "language_loss": 0.65600222, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.6749723, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.5866358280181885 + }, + { + "auxiliary_loss_clip": 0.01151242, + "auxiliary_loss_mlp": 0.01107214, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00059366, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.8290184887557022, + "language_loss": 0.79380524, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.8163898, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.551074743270874 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01107921, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.00053763, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 2.0715665364320204, + "language_loss": 0.78584135, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80841988, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 2.5789029598236084 + }, + { + "auxiliary_loss_clip": 0.01166197, + "auxiliary_loss_mlp": 0.01107963, + "balance_loss_clip": 1.00190651, + "balance_loss_mlp": 1.00048399, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.7017455388545937, + "language_loss": 0.70329314, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72603476, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.550086259841919 + }, + { + "auxiliary_loss_clip": 0.01151003, + "auxiliary_loss_mlp": 0.0110826, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00049508, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 2.8982527808649463, + "language_loss": 0.62610352, + "learning_rate": 1.123545533127549e-06, + "loss": 0.64869618, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 2.529487133026123 + }, + { + "auxiliary_loss_clip": 0.01151269, + "auxiliary_loss_mlp": 0.01106879, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00054479, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 1.7560949114810271, + "language_loss": 0.78634328, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.80892479, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 2.51517915725708 + }, + { + "auxiliary_loss_clip": 0.0113387, + "auxiliary_loss_mlp": 0.01107346, + "balance_loss_clip": 1.00178611, + "balance_loss_mlp": 1.00053501, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.354624043718536, + "language_loss": 0.70524561, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72765779, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.6573398113250732 + }, + { + "auxiliary_loss_clip": 0.01166078, + "auxiliary_loss_mlp": 0.01107864, + "balance_loss_clip": 1.00187457, + "balance_loss_mlp": 1.0005753, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.5911802066077396, + "language_loss": 0.75750977, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.78024924, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.5292603969573975 + }, + { + "auxiliary_loss_clip": 0.01134665, + "auxiliary_loss_mlp": 0.01106659, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00061047, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.183260318478961, + "language_loss": 0.73393881, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75635207, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.577608108520508 + }, + { + "auxiliary_loss_clip": 0.01132099, + "auxiliary_loss_mlp": 0.01106872, + "balance_loss_clip": 1.00166202, + "balance_loss_mlp": 1.00053728, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.2808164693922617, + "language_loss": 0.55871427, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58110404, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 2.636827230453491 + }, + { + "auxiliary_loss_clip": 0.01149455, + "auxiliary_loss_mlp": 0.01107865, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.00048089, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.5861210290195722, + "language_loss": 0.76588035, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.78845352, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.5454864501953125 + }, + { + "auxiliary_loss_clip": 0.01166027, + "auxiliary_loss_mlp": 0.01106798, + "balance_loss_clip": 1.00197947, + "balance_loss_mlp": 1.00046372, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.7363449062168232, + "language_loss": 0.7332232, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75595146, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.5236380100250244 + }, + { + "auxiliary_loss_clip": 0.01165993, + "auxiliary_loss_mlp": 0.01107528, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00062084, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.6697580775126402, + "language_loss": 0.67789364, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.70062888, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 2.519458293914795 + }, + { + "auxiliary_loss_clip": 0.01135197, + "auxiliary_loss_mlp": 0.00747734, + "balance_loss_clip": 1.00180805, + "balance_loss_mlp": 1.00088203, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.8850650153686197, + "language_loss": 0.66449636, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.68332571, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.6537082195281982 + }, + { + "auxiliary_loss_clip": 0.01151286, + "auxiliary_loss_mlp": 0.01108191, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00052083, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 3.012784122266324, + "language_loss": 0.90586156, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92845631, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.5619406700134277 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01107157, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00044107, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 1.6898354235462512, + "language_loss": 0.75397885, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77655739, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.5808606147766113 + }, + { + "auxiliary_loss_clip": 0.01166206, + "auxiliary_loss_mlp": 0.01107993, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00070429, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.9498693372643703, + "language_loss": 0.74481738, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76755941, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.500051975250244 + }, + { + "auxiliary_loss_clip": 0.01117644, + "auxiliary_loss_mlp": 0.01107698, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00040996, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.6498286500836623, + "language_loss": 0.72297108, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74522448, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 2.685235023498535 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.01107994, + "balance_loss_clip": 1.00203657, + "balance_loss_mlp": 1.00061083, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.194185789541534, + "language_loss": 0.80859482, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83133614, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.4807369709014893 + }, + { + "auxiliary_loss_clip": 0.01166171, + "auxiliary_loss_mlp": 0.01108079, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00050497, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.4466250460019023, + "language_loss": 0.64134705, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.66408956, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.5572996139526367 + }, + { + "auxiliary_loss_clip": 0.01136534, + "auxiliary_loss_mlp": 0.01108801, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.00055921, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 3.1807958895038277, + "language_loss": 0.75457823, + "learning_rate": 1.117948625548313e-06, + "loss": 0.77703154, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 2.5597269535064697 + }, + { + "auxiliary_loss_clip": 0.01165954, + "auxiliary_loss_mlp": 0.01106732, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00068343, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.635647936934171, + "language_loss": 0.75876588, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.78149271, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 3.8909847736358643 + }, + { + "auxiliary_loss_clip": 0.01118623, + "auxiliary_loss_mlp": 0.00747645, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00087595, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.5642952046193048, + "language_loss": 0.77554643, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79420912, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 2.6086618900299072 + }, + { + "auxiliary_loss_clip": 0.01134167, + "auxiliary_loss_mlp": 0.0110593, + "balance_loss_clip": 1.00177765, + "balance_loss_mlp": 1.00054932, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 2.0416899059836195, + "language_loss": 0.71209967, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73450065, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 2.589759349822998 + }, + { + "auxiliary_loss_clip": 0.0111721, + "auxiliary_loss_mlp": 0.01107079, + "balance_loss_clip": 1.00161958, + "balance_loss_mlp": 1.00045872, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.7043598894936862, + "language_loss": 0.74072564, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76296854, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.6599738597869873 + }, + { + "auxiliary_loss_clip": 0.01118114, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.0004462, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.8694543523148226, + "language_loss": 0.79556954, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81782424, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 2.6776342391967773 + }, + { + "auxiliary_loss_clip": 0.01133142, + "auxiliary_loss_mlp": 0.01107038, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00060773, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 1.7188993402568395, + "language_loss": 0.75978684, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78218865, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 2.565899610519409 + }, + { + "auxiliary_loss_clip": 0.01166067, + "auxiliary_loss_mlp": 0.00747599, + "balance_loss_clip": 1.00191283, + "balance_loss_mlp": 1.00080013, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.9680429804553161, + "language_loss": 0.70092869, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.72006536, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.5469913482666016 + }, + { + "auxiliary_loss_clip": 0.01117885, + "auxiliary_loss_mlp": 0.01106541, + "balance_loss_clip": 1.00180495, + "balance_loss_mlp": 1.00049257, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.5858571601984819, + "language_loss": 0.76070249, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78294677, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.6217267513275146 + }, + { + "auxiliary_loss_clip": 0.01145339, + "auxiliary_loss_mlp": 0.00746467, + "balance_loss_clip": 1.0012002, + "balance_loss_mlp": 1.00066817, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7135904535740206, + "language_loss": 0.53023428, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.54915237, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 3.149714946746826 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.01107178, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00046182, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.5899568174657541, + "language_loss": 0.65511644, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67769605, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.6408450603485107 + }, + { + "auxiliary_loss_clip": 0.01135303, + "auxiliary_loss_mlp": 0.01106741, + "balance_loss_clip": 1.00172186, + "balance_loss_mlp": 1.00040627, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.697860419386328, + "language_loss": 0.80973423, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83215463, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 4.009931802749634 + }, + { + "auxiliary_loss_clip": 0.01099712, + "auxiliary_loss_mlp": 0.00747675, + "balance_loss_clip": 1.0016135, + "balance_loss_mlp": 1.00082505, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 7.296307429610259, + "language_loss": 0.7099421, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.72841597, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 4.148904800415039 + }, + { + "auxiliary_loss_clip": 0.01117288, + "auxiliary_loss_mlp": 0.01107717, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00061917, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.8922604046567828, + "language_loss": 0.80562097, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82787102, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 2.6022286415100098 + }, + { + "auxiliary_loss_clip": 0.01151365, + "auxiliary_loss_mlp": 0.01106563, + "balance_loss_clip": 1.00204647, + "balance_loss_mlp": 1.00051451, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.4856130929109288, + "language_loss": 0.72331041, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74588966, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.567509651184082 + }, + { + "auxiliary_loss_clip": 0.01151317, + "auxiliary_loss_mlp": 0.01107882, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00049853, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.01509664814793, + "language_loss": 0.72543335, + "learning_rate": 1.112709300197942e-06, + "loss": 0.7480253, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 3.9176206588745117 + }, + { + "auxiliary_loss_clip": 0.01100624, + "auxiliary_loss_mlp": 0.01108173, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00050342, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.691443569221511, + "language_loss": 0.72799706, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.750085, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 2.688607692718506 + }, + { + "auxiliary_loss_clip": 0.01111457, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_clip": 1.00118089, + "balance_loss_mlp": 1.00013602, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7315145149641797, + "language_loss": 0.64451361, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66648412, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.2181508541107178 + }, + { + "auxiliary_loss_clip": 0.01151143, + "auxiliary_loss_mlp": 0.01106953, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00052297, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.5766270802837332, + "language_loss": 0.77763814, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80021912, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.5946707725524902 + }, + { + "auxiliary_loss_clip": 0.01119064, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00054717, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.6769043468251372, + "language_loss": 0.65500998, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67727423, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.663618564605713 + }, + { + "auxiliary_loss_clip": 0.01102595, + "auxiliary_loss_mlp": 0.01106521, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.0004729, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 2.2544485103100804, + "language_loss": 0.70903975, + "learning_rate": 1.110964538515258e-06, + "loss": 0.73113096, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.6495797634124756 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.0110763, + "balance_loss_clip": 1.00185692, + "balance_loss_mlp": 1.00072312, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 1.8525123494885265, + "language_loss": 0.68303001, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70512116, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 2.6315767765045166 + }, + { + "auxiliary_loss_clip": 0.01134462, + "auxiliary_loss_mlp": 0.00747553, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00081015, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 1.6285055616694695, + "language_loss": 0.80049556, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.81931567, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 2.7581369876861572 + }, + { + "auxiliary_loss_clip": 0.01101089, + "auxiliary_loss_mlp": 0.0110767, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00047708, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.7575477826059693, + "language_loss": 0.73913181, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.76121938, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.662886619567871 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01106771, + "balance_loss_clip": 1.00189221, + "balance_loss_mlp": 1.00043654, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.542579085776236, + "language_loss": 0.75971889, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78229654, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 2.730003595352173 + }, + { + "auxiliary_loss_clip": 0.01118314, + "auxiliary_loss_mlp": 0.01108211, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00073195, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 2.156034425140938, + "language_loss": 0.78081107, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80307627, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.6377112865448 + }, + { + "auxiliary_loss_clip": 0.01116586, + "auxiliary_loss_mlp": 0.01106287, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00061989, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 1.6759329904909686, + "language_loss": 0.69104671, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71327543, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.6217055320739746 + }, + { + "auxiliary_loss_clip": 0.01132575, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00057364, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.02639734320625, + "language_loss": 0.68445402, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70685744, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.5325522422790527 + }, + { + "auxiliary_loss_clip": 0.01134837, + "auxiliary_loss_mlp": 0.01107471, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00056362, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 1.9826201869421123, + "language_loss": 0.71797293, + "learning_rate": 1.108174673550927e-06, + "loss": 0.74039602, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 2.5536322593688965 + }, + { + "auxiliary_loss_clip": 0.01134583, + "auxiliary_loss_mlp": 0.00747542, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00085902, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.1931178369544795, + "language_loss": 0.77418602, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79300731, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 2.549130916595459 + }, + { + "auxiliary_loss_clip": 0.01104302, + "auxiliary_loss_mlp": 0.0110752, + "balance_loss_clip": 1.00180686, + "balance_loss_mlp": 1.00051737, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 1.8909813564726297, + "language_loss": 0.68102902, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70314723, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.7587521076202393 + }, + { + "auxiliary_loss_clip": 0.01151238, + "auxiliary_loss_mlp": 0.00747621, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00090134, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.7823798776677742, + "language_loss": 0.68346256, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70245111, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 2.5478546619415283 + }, + { + "auxiliary_loss_clip": 0.01121273, + "auxiliary_loss_mlp": 0.01108353, + "balance_loss_clip": 1.00185311, + "balance_loss_mlp": 1.00049257, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 1.7749063162955245, + "language_loss": 0.712897, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73519325, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 2.6321592330932617 + }, + { + "auxiliary_loss_clip": 0.01116376, + "auxiliary_loss_mlp": 0.01106867, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00062799, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.652186419706013, + "language_loss": 0.5922209, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61445332, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.7538392543792725 + }, + { + "auxiliary_loss_clip": 0.01149546, + "auxiliary_loss_mlp": 0.01107671, + "balance_loss_clip": 1.00188959, + "balance_loss_mlp": 1.00057292, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.438556431169897, + "language_loss": 0.7236253, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74619746, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.5858616828918457 + }, + { + "auxiliary_loss_clip": 0.01132912, + "auxiliary_loss_mlp": 0.01106462, + "balance_loss_clip": 1.00176489, + "balance_loss_mlp": 1.00050926, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.4483880193970953, + "language_loss": 0.70254219, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72493601, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 2.7670249938964844 + }, + { + "auxiliary_loss_clip": 0.01149551, + "auxiliary_loss_mlp": 0.01106928, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00059319, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.996115320122618, + "language_loss": 0.81902921, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84159398, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.5429508686065674 + }, + { + "auxiliary_loss_clip": 0.01103655, + "auxiliary_loss_mlp": 0.00747561, + "balance_loss_clip": 1.00168395, + "balance_loss_mlp": 1.00092864, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.7821886828104039, + "language_loss": 0.77355874, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79207087, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 2.794124126434326 + }, + { + "auxiliary_loss_clip": 0.0114923, + "auxiliary_loss_mlp": 0.01106336, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.0003829, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.6540377629354464, + "language_loss": 0.79026037, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81281602, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 3.9419195652008057 + }, + { + "auxiliary_loss_clip": 0.01145338, + "auxiliary_loss_mlp": 0.01085325, + "balance_loss_clip": 1.00120306, + "balance_loss_mlp": 1.00025749, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7342420533845697, + "language_loss": 0.61807388, + "learning_rate": 1.104342144597323e-06, + "loss": 0.6403805, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 3.223968505859375 + }, + { + "auxiliary_loss_clip": 0.01151076, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00069177, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 1.8709220579490629, + "language_loss": 0.66916651, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69174659, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.503042459487915 + }, + { + "auxiliary_loss_clip": 0.01151197, + "auxiliary_loss_mlp": 0.01106524, + "balance_loss_clip": 1.00186229, + "balance_loss_mlp": 1.00066638, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.6159159570989063, + "language_loss": 0.76537001, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.7879473, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 2.6029491424560547 + }, + { + "auxiliary_loss_clip": 0.0116604, + "auxiliary_loss_mlp": 0.01106336, + "balance_loss_clip": 1.00202918, + "balance_loss_mlp": 1.00047851, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.5739365305005188, + "language_loss": 0.73196602, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75468981, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 2.4871833324432373 + }, + { + "auxiliary_loss_clip": 0.01117674, + "auxiliary_loss_mlp": 0.01107726, + "balance_loss_clip": 1.00174069, + "balance_loss_mlp": 1.00072324, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 1.7826324581284747, + "language_loss": 0.78566772, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80792177, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.661292314529419 + }, + { + "auxiliary_loss_clip": 0.01136491, + "auxiliary_loss_mlp": 0.01107163, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.0005424, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 1.9855188197101517, + "language_loss": 0.68966246, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71209896, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.5480246543884277 + }, + { + "auxiliary_loss_clip": 0.01136209, + "auxiliary_loss_mlp": 0.0110652, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00066268, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 1.8390792362797213, + "language_loss": 0.80421948, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.8266468, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 2.6072449684143066 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01106443, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00058556, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 1.9229171898951163, + "language_loss": 0.80986887, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83242506, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 2.537932872772217 + }, + { + "auxiliary_loss_clip": 0.01131915, + "auxiliary_loss_mlp": 0.01105247, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00053394, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.6809797041459782, + "language_loss": 0.7604959, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78286749, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.766029119491577 + }, + { + "auxiliary_loss_clip": 0.011194, + "auxiliary_loss_mlp": 0.01106862, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00062251, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.5397396340352574, + "language_loss": 0.754296, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77655858, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 3.995852470397949 + }, + { + "auxiliary_loss_clip": 0.01150711, + "auxiliary_loss_mlp": 0.01106513, + "balance_loss_clip": 1.0018512, + "balance_loss_mlp": 1.00046432, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 2.13997775870443, + "language_loss": 0.6469847, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66955698, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.5575428009033203 + }, + { + "auxiliary_loss_clip": 0.01166199, + "auxiliary_loss_mlp": 0.01107941, + "balance_loss_clip": 1.0020088, + "balance_loss_mlp": 1.00055766, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.002751947630384, + "language_loss": 0.8187573, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.84149873, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 3.862776041030884 + }, + { + "auxiliary_loss_clip": 0.01119898, + "auxiliary_loss_mlp": 0.01107508, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00050581, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.5819232822907217, + "language_loss": 0.73483729, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75711131, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 2.6684794425964355 + }, + { + "auxiliary_loss_clip": 0.01151412, + "auxiliary_loss_mlp": 0.01107664, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00066185, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1.9240277857141046, + "language_loss": 0.80029726, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82288802, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 2.5182688236236572 + }, + { + "auxiliary_loss_clip": 0.01099931, + "auxiliary_loss_mlp": 0.0074743, + "balance_loss_clip": 1.0017451, + "balance_loss_mlp": 1.00079036, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.5715820660253128, + "language_loss": 0.78630841, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80478197, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 4.041347026824951 + }, + { + "auxiliary_loss_clip": 0.0111994, + "auxiliary_loss_mlp": 0.0110704, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00051427, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.6399587341672168, + "language_loss": 0.73905444, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76132417, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 2.6566591262817383 + }, + { + "auxiliary_loss_clip": 0.0111918, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00051749, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 1.9621879690727688, + "language_loss": 0.73614848, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75841355, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.5966758728027344 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.01107177, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.0004611, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.608082822342699, + "language_loss": 0.77036536, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79294956, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 2.575622797012329 + }, + { + "auxiliary_loss_clip": 0.01146965, + "auxiliary_loss_mlp": 0.01085164, + "balance_loss_clip": 1.00122225, + "balance_loss_mlp": 1.0000968, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6928548004847975, + "language_loss": 0.48433739, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50665867, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.1103339195251465 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.01107189, + "balance_loss_clip": 1.0017072, + "balance_loss_mlp": 1.0003773, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.8034088022243455, + "language_loss": 0.79442894, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81656355, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.638930320739746 + }, + { + "auxiliary_loss_clip": 0.01151183, + "auxiliary_loss_mlp": 0.01106384, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00052643, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 1.9273389046620433, + "language_loss": 0.65547425, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67804992, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 2.542710304260254 + }, + { + "auxiliary_loss_clip": 0.01151136, + "auxiliary_loss_mlp": 0.01105752, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00037122, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.5132889169034653, + "language_loss": 0.76474273, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78731161, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.5868375301361084 + }, + { + "auxiliary_loss_clip": 0.01089696, + "auxiliary_loss_mlp": 0.01107343, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00053096, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.2029127793299668, + "language_loss": 0.70203656, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72400695, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.7287864685058594 + }, + { + "auxiliary_loss_clip": 0.01149315, + "auxiliary_loss_mlp": 0.01106207, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00044441, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 1.8355713824900584, + "language_loss": 0.56138051, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.58393574, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 2.6296489238739014 + }, + { + "auxiliary_loss_clip": 0.01132654, + "auxiliary_loss_mlp": 0.01107946, + "balance_loss_clip": 1.00177467, + "balance_loss_mlp": 1.00056219, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 2.6753313600132493, + "language_loss": 0.78846258, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81086856, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.549396276473999 + }, + { + "auxiliary_loss_clip": 0.01149639, + "auxiliary_loss_mlp": 0.01107081, + "balance_loss_clip": 1.00176752, + "balance_loss_mlp": 1.00055587, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.543256505172527, + "language_loss": 0.68927884, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71184605, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 2.530459403991699 + }, + { + "auxiliary_loss_clip": 0.01148131, + "auxiliary_loss_mlp": 0.01106779, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00044394, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.5541572133030057, + "language_loss": 0.70784992, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.73039901, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.5580310821533203 + }, + { + "auxiliary_loss_clip": 0.0113269, + "auxiliary_loss_mlp": 0.01106301, + "balance_loss_clip": 1.00165105, + "balance_loss_mlp": 1.00053847, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.6231140687348387, + "language_loss": 0.67280543, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69519532, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.608045816421509 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01107281, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00046992, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 1.9252604967774238, + "language_loss": 0.81483048, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83708578, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 2.653589963912964 + }, + { + "auxiliary_loss_clip": 0.01116859, + "auxiliary_loss_mlp": 0.01107265, + "balance_loss_clip": 1.00167346, + "balance_loss_mlp": 1.00064445, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.1872389327340036, + "language_loss": 0.67419434, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69643557, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.6230435371398926 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01106985, + "balance_loss_clip": 1.00182152, + "balance_loss_mlp": 1.00055456, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.217719510269244, + "language_loss": 0.73496568, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75738567, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.5916171073913574 + }, + { + "auxiliary_loss_clip": 0.01117898, + "auxiliary_loss_mlp": 0.01106225, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00046277, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.5959318923838144, + "language_loss": 0.72994173, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.75218296, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.7541754245758057 + }, + { + "auxiliary_loss_clip": 0.01102098, + "auxiliary_loss_mlp": 0.01107023, + "balance_loss_clip": 1.00190914, + "balance_loss_mlp": 1.00049734, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.127294569542116, + "language_loss": 0.68569708, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.70778829, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.758814811706543 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.01107181, + "balance_loss_clip": 1.00187218, + "balance_loss_mlp": 1.00046456, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.6292057849051422, + "language_loss": 0.69479394, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71737826, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 2.6035032272338867 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.0110652, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.000471, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 2.8662367624836262, + "language_loss": 0.70580667, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72838312, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.6932971477508545 + }, + { + "auxiliary_loss_clip": 0.01117101, + "auxiliary_loss_mlp": 0.0110619, + "balance_loss_clip": 1.00162971, + "balance_loss_mlp": 1.00061882, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.5624306666717955, + "language_loss": 0.83866894, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.86090183, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.6053154468536377 + }, + { + "auxiliary_loss_clip": 0.01149165, + "auxiliary_loss_mlp": 0.01106916, + "balance_loss_clip": 1.00179553, + "balance_loss_mlp": 1.00048602, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.206682303943537, + "language_loss": 0.7430675, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76562828, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 4.007374286651611 + }, + { + "auxiliary_loss_clip": 0.01150631, + "auxiliary_loss_mlp": 0.01105787, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00050163, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.7211942965022542, + "language_loss": 0.79156888, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.81413305, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 2.519132614135742 + }, + { + "auxiliary_loss_clip": 0.01131411, + "auxiliary_loss_mlp": 0.01084678, + "balance_loss_clip": 1.00172496, + "balance_loss_mlp": 0.9999916, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.813605749016126, + "language_loss": 0.54160768, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56376857, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 3.2373595237731934 + }, + { + "auxiliary_loss_clip": 0.01085606, + "auxiliary_loss_mlp": 0.01105668, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00066864, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.3933668661277032, + "language_loss": 0.7712906, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79320335, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 2.7513933181762695 + }, + { + "auxiliary_loss_clip": 0.01133969, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00062251, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.9407702867690848, + "language_loss": 0.77129555, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.79369909, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.5507307052612305 + }, + { + "auxiliary_loss_clip": 0.01165958, + "auxiliary_loss_mlp": 0.01106573, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00042951, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 14.601426275898671, + "language_loss": 0.60669982, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62942517, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.5027339458465576 + }, + { + "auxiliary_loss_clip": 0.01134876, + "auxiliary_loss_mlp": 0.01107145, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00061917, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.175439770095459, + "language_loss": 0.6799823, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70240247, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.607593059539795 + }, + { + "auxiliary_loss_clip": 0.01149363, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.0004462, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 2.76213783455473, + "language_loss": 0.87727487, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89984208, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 2.5609073638916016 + }, + { + "auxiliary_loss_clip": 0.01151258, + "auxiliary_loss_mlp": 0.01108602, + "balance_loss_clip": 1.00182784, + "balance_loss_mlp": 1.00045526, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 2.356267461806026, + "language_loss": 0.66719091, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.68978941, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.557152271270752 + }, + { + "auxiliary_loss_clip": 0.01119527, + "auxiliary_loss_mlp": 0.01107194, + "balance_loss_clip": 1.00181425, + "balance_loss_mlp": 1.00066829, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.553662669702844, + "language_loss": 0.76885867, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.79112589, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.5900511741638184 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01105994, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00061321, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.946767641703895, + "language_loss": 0.74316609, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76556253, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 4.010998964309692 + }, + { + "auxiliary_loss_clip": 0.01166098, + "auxiliary_loss_mlp": 0.01106894, + "balance_loss_clip": 1.00200951, + "balance_loss_mlp": 1.00055909, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.5758537076560506, + "language_loss": 0.68954784, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71227777, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 2.506737470626831 + }, + { + "auxiliary_loss_clip": 0.01133967, + "auxiliary_loss_mlp": 0.01107028, + "balance_loss_clip": 1.00170255, + "balance_loss_mlp": 1.00050235, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 2.229501117730008, + "language_loss": 0.68769872, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.71010864, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 4.053730249404907 + }, + { + "auxiliary_loss_clip": 0.01147258, + "auxiliary_loss_mlp": 0.01084687, + "balance_loss_clip": 1.00126576, + "balance_loss_mlp": 1.00000048, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6520355227010161, + "language_loss": 0.51145506, + "learning_rate": 1.087320141976297e-06, + "loss": 0.5337745, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.0736474990844727 + }, + { + "auxiliary_loss_clip": 0.011661, + "auxiliary_loss_mlp": 0.00747536, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00066411, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.4387792754267914, + "language_loss": 0.70934856, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72848487, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.514479875564575 + }, + { + "auxiliary_loss_clip": 0.01130824, + "auxiliary_loss_mlp": 0.01106043, + "balance_loss_clip": 1.00205171, + "balance_loss_mlp": 1.00056684, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.797021273317837, + "language_loss": 0.65101767, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67338634, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 2.674607753753662 + }, + { + "auxiliary_loss_clip": 0.0116588, + "auxiliary_loss_mlp": 0.01106467, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00041866, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.8139818890543145, + "language_loss": 0.73100948, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75373292, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 3.9472920894622803 + }, + { + "auxiliary_loss_clip": 0.01151185, + "auxiliary_loss_mlp": 0.01106524, + "balance_loss_clip": 1.00197935, + "balance_loss_mlp": 1.00047517, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 1.7826066243638574, + "language_loss": 0.78724599, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.80982304, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 2.52866268157959 + }, + { + "auxiliary_loss_clip": 0.01148947, + "auxiliary_loss_mlp": 0.01107738, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00044942, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.852884136948558, + "language_loss": 0.68845737, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71102417, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 2.5260186195373535 + }, + { + "auxiliary_loss_clip": 0.01149364, + "auxiliary_loss_mlp": 0.01107898, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.0006094, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.13156802313981, + "language_loss": 0.69898379, + "learning_rate": 1.085241494478132e-06, + "loss": 0.72155637, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.4997901916503906 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.01105803, + "balance_loss_clip": 1.00173187, + "balance_loss_mlp": 1.00051785, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.513418011986641, + "language_loss": 0.78331065, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80571866, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 2.595750093460083 + }, + { + "auxiliary_loss_clip": 0.01149293, + "auxiliary_loss_mlp": 0.01107532, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00062513, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5633107473992665, + "language_loss": 0.76332724, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78589547, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.5566635131835938 + }, + { + "auxiliary_loss_clip": 0.01149122, + "auxiliary_loss_mlp": 0.01106555, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.0004108, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.5284276310391638, + "language_loss": 0.78473151, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80728829, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 2.546348810195923 + }, + { + "auxiliary_loss_clip": 0.01166154, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00058317, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.818081351496747, + "language_loss": 0.8192569, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.84200007, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 2.4946353435516357 + }, + { + "auxiliary_loss_clip": 0.01126425, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_clip": 1.00160933, + "balance_loss_mlp": 1.00016963, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 1.0436771391610347, + "language_loss": 0.67341131, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69553179, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.0921967029571533 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.01107085, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00046396, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.656438517100361, + "language_loss": 0.71175647, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73431915, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 2.582326889038086 + }, + { + "auxiliary_loss_clip": 0.01149816, + "auxiliary_loss_mlp": 0.01106589, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00063586, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.4190074380090536, + "language_loss": 0.7235443, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74610835, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.564420461654663 + }, + { + "auxiliary_loss_clip": 0.01151039, + "auxiliary_loss_mlp": 0.01106747, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00079417, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.7788456477408976, + "language_loss": 0.79342812, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81600595, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.583465576171875 + }, + { + "auxiliary_loss_clip": 0.011347, + "auxiliary_loss_mlp": 0.01106281, + "balance_loss_clip": 1.00197661, + "balance_loss_mlp": 1.0005188, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 3.706939108367682, + "language_loss": 0.70318478, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72559464, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 2.560828924179077 + }, + { + "auxiliary_loss_clip": 0.01132693, + "auxiliary_loss_mlp": 0.00747366, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00055695, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.7057711051680469, + "language_loss": 0.76950622, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78830683, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.5985264778137207 + }, + { + "auxiliary_loss_clip": 0.01151169, + "auxiliary_loss_mlp": 0.00747435, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00061178, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.8564024614151273, + "language_loss": 0.82336235, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.8423484, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.521721363067627 + }, + { + "auxiliary_loss_clip": 0.01149689, + "auxiliary_loss_mlp": 0.01107706, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00051296, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 2.2562527892112096, + "language_loss": 0.69867408, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.72124803, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 2.5406696796417236 + }, + { + "auxiliary_loss_clip": 0.01134829, + "auxiliary_loss_mlp": 0.0110705, + "balance_loss_clip": 1.00200331, + "balance_loss_mlp": 1.00071585, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.8583012656018385, + "language_loss": 0.7776562, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.80007499, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.80349063873291 + }, + { + "auxiliary_loss_clip": 0.01134394, + "auxiliary_loss_mlp": 0.01105618, + "balance_loss_clip": 1.00164914, + "balance_loss_mlp": 1.00061882, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 1.968788116015683, + "language_loss": 0.83154476, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85394484, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.55012845993042 + }, + { + "auxiliary_loss_clip": 0.01150921, + "auxiliary_loss_mlp": 0.00747491, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00062788, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.5994598368253252, + "language_loss": 0.72126269, + "learning_rate": 1.080050345253328e-06, + "loss": 0.74024689, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.56893253326416 + }, + { + "auxiliary_loss_clip": 0.01134995, + "auxiliary_loss_mlp": 0.01107255, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00034833, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.6249733581579209, + "language_loss": 0.72446048, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74688298, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.6079447269439697 + }, + { + "auxiliary_loss_clip": 0.01134699, + "auxiliary_loss_mlp": 0.01108236, + "balance_loss_clip": 1.00172055, + "balance_loss_mlp": 1.00047052, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 1.9917743289070542, + "language_loss": 0.82962954, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85205889, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.5432486534118652 + }, + { + "auxiliary_loss_clip": 0.01134181, + "auxiliary_loss_mlp": 0.0110803, + "balance_loss_clip": 1.00174856, + "balance_loss_mlp": 1.00036049, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.2684628233833264, + "language_loss": 0.72452444, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74694651, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 2.522827625274658 + }, + { + "auxiliary_loss_clip": 0.01120714, + "auxiliary_loss_mlp": 0.01106936, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.0004102, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 2.3910430846712014, + "language_loss": 0.75242007, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77469647, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.634718656539917 + }, + { + "auxiliary_loss_clip": 0.01117922, + "auxiliary_loss_mlp": 0.01107635, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00044155, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.5029801067675144, + "language_loss": 0.69238102, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71463656, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 4.0154335498809814 + }, + { + "auxiliary_loss_clip": 0.01166115, + "auxiliary_loss_mlp": 0.01107602, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.0005995, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.4565467482970091, + "language_loss": 0.79096907, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.81370628, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 2.5368595123291016 + }, + { + "auxiliary_loss_clip": 0.01150786, + "auxiliary_loss_mlp": 0.01107085, + "balance_loss_clip": 1.00194204, + "balance_loss_mlp": 1.00055957, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.8046747575843218, + "language_loss": 0.76022232, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78280097, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.540877103805542 + }, + { + "auxiliary_loss_clip": 0.01134876, + "auxiliary_loss_mlp": 0.01107635, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00053751, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.0920156728746817, + "language_loss": 0.6960184, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.71844351, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 2.576975107192993 + }, + { + "auxiliary_loss_clip": 0.01149145, + "auxiliary_loss_mlp": 0.01106468, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00060987, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 1.8452653413333637, + "language_loss": 0.7964226, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.81897873, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.5531318187713623 + }, + { + "auxiliary_loss_clip": 0.01166048, + "auxiliary_loss_mlp": 0.0110788, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00059175, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.7923559595297625, + "language_loss": 0.76671672, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78945601, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.4807121753692627 + }, + { + "auxiliary_loss_clip": 0.01150688, + "auxiliary_loss_mlp": 0.01108026, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.00045156, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.2619825749862676, + "language_loss": 0.74917156, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.77175868, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.5231573581695557 + }, + { + "auxiliary_loss_clip": 0.01149366, + "auxiliary_loss_mlp": 0.01107851, + "balance_loss_clip": 1.00186169, + "balance_loss_mlp": 1.00065851, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.099864342991465, + "language_loss": 0.74849868, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77107084, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.482361078262329 + }, + { + "auxiliary_loss_clip": 0.01104451, + "auxiliary_loss_mlp": 0.01106169, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.00040698, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 2.118464761119824, + "language_loss": 0.80330002, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82540619, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 2.675546884536743 + }, + { + "auxiliary_loss_clip": 0.01134852, + "auxiliary_loss_mlp": 0.011074, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.00049257, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.5849841449128574, + "language_loss": 0.80407989, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82650244, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 3.9821197986602783 + }, + { + "auxiliary_loss_clip": 0.01149282, + "auxiliary_loss_mlp": 0.01106613, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00056469, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.5588769206295066, + "language_loss": 0.75500983, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77756882, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 2.570178985595703 + }, + { + "auxiliary_loss_clip": 0.01118018, + "auxiliary_loss_mlp": 0.01106899, + "balance_loss_clip": 1.00181019, + "balance_loss_mlp": 1.00037384, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.793206757360663, + "language_loss": 0.82955223, + "learning_rate": 1.074521771867622e-06, + "loss": 0.8518014, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 3.9672539234161377 + }, + { + "auxiliary_loss_clip": 0.01161618, + "auxiliary_loss_mlp": 0.01085098, + "balance_loss_clip": 1.00117791, + "balance_loss_mlp": 1.00003052, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7734475891157979, + "language_loss": 0.52333009, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54579723, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 3.0898044109344482 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01108047, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.0006634, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.5688796389562523, + "language_loss": 0.789783, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81187576, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 2.7167398929595947 + }, + { + "auxiliary_loss_clip": 0.01117265, + "auxiliary_loss_mlp": 0.01107898, + "balance_loss_clip": 1.00170493, + "balance_loss_mlp": 1.00070536, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 2.1143978930426064, + "language_loss": 0.64320385, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66545546, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 4.128339052200317 + }, + { + "auxiliary_loss_clip": 0.01115685, + "auxiliary_loss_mlp": 0.01107468, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.0003705, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.5283065352318297, + "language_loss": 0.63841093, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66064245, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 2.6300463676452637 + }, + { + "auxiliary_loss_clip": 0.01117436, + "auxiliary_loss_mlp": 0.01106822, + "balance_loss_clip": 1.00172281, + "balance_loss_mlp": 1.00048685, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 1.7497630698990156, + "language_loss": 0.71996433, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.74220693, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 2.6017847061157227 + }, + { + "auxiliary_loss_clip": 0.01150951, + "auxiliary_loss_mlp": 0.01107706, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00079942, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.1211805301773405, + "language_loss": 0.6161195, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63870603, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 2.6088321208953857 + }, + { + "auxiliary_loss_clip": 0.0114919, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_clip": 1.0017854, + "balance_loss_mlp": 1.00042319, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.332678060247673, + "language_loss": 0.68654096, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70911193, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.592498540878296 + }, + { + "auxiliary_loss_clip": 0.01149528, + "auxiliary_loss_mlp": 0.01105317, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00060415, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.5427851871239036, + "language_loss": 0.83869135, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86123985, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 2.5840423107147217 + }, + { + "auxiliary_loss_clip": 0.01118055, + "auxiliary_loss_mlp": 0.01108002, + "balance_loss_clip": 1.00167799, + "balance_loss_mlp": 1.00052285, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 1.9225941537052422, + "language_loss": 0.692927, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71518761, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.5762226581573486 + }, + { + "auxiliary_loss_clip": 0.01149328, + "auxiliary_loss_mlp": 0.01108228, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00065386, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.5122091019879969, + "language_loss": 0.64346069, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66603619, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 2.5634751319885254 + }, + { + "auxiliary_loss_clip": 0.01119271, + "auxiliary_loss_mlp": 0.01106407, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00054884, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.5377085141920424, + "language_loss": 0.71111548, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73337227, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 2.7518246173858643 + }, + { + "auxiliary_loss_clip": 0.01082685, + "auxiliary_loss_mlp": 0.01108067, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00068378, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.7434805000496076, + "language_loss": 0.77192265, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79383016, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 2.6598587036132812 + }, + { + "auxiliary_loss_clip": 0.01115558, + "auxiliary_loss_mlp": 0.01085061, + "balance_loss_clip": 1.00116849, + "balance_loss_mlp": 0.9999935, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7891407355400335, + "language_loss": 0.55084044, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57284665, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.2211501598358154 + }, + { + "auxiliary_loss_clip": 0.01149365, + "auxiliary_loss_mlp": 0.01107013, + "balance_loss_clip": 1.00200927, + "balance_loss_mlp": 1.0004878, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.9579967419940112, + "language_loss": 0.63799995, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66056371, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 2.6296029090881348 + }, + { + "auxiliary_loss_clip": 0.01166063, + "auxiliary_loss_mlp": 0.01107287, + "balance_loss_clip": 1.00197685, + "balance_loss_mlp": 1.00047517, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.0543608983667965, + "language_loss": 0.78933072, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.81206423, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 2.5105557441711426 + }, + { + "auxiliary_loss_clip": 0.01132672, + "auxiliary_loss_mlp": 0.01107578, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00057554, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 2.844407017897393, + "language_loss": 0.85277736, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87517989, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.5717716217041016 + }, + { + "auxiliary_loss_clip": 0.01103199, + "auxiliary_loss_mlp": 0.0110844, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.0006752, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.1652167901290738, + "language_loss": 0.7458775, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76799387, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 2.638770580291748 + }, + { + "auxiliary_loss_clip": 0.01130345, + "auxiliary_loss_mlp": 0.01106878, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00044751, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.5217031541342496, + "language_loss": 0.79687709, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81924933, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.6005516052246094 + }, + { + "auxiliary_loss_clip": 0.0110082, + "auxiliary_loss_mlp": 0.01106995, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00046921, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.507626138524542, + "language_loss": 0.73944128, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76151943, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.62852144241333 + }, + { + "auxiliary_loss_clip": 0.01118302, + "auxiliary_loss_mlp": 0.01107811, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00061798, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 2.0689213821144232, + "language_loss": 0.73060262, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75286376, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 2.5963454246520996 + }, + { + "auxiliary_loss_clip": 0.01101543, + "auxiliary_loss_mlp": 0.01107305, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.00039792, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 1.6928146413363414, + "language_loss": 0.69599128, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71807981, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.664790391921997 + }, + { + "auxiliary_loss_clip": 0.01149251, + "auxiliary_loss_mlp": 0.01107619, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00052178, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 1.6970440688034238, + "language_loss": 0.803931, + "learning_rate": 1.066934663776291e-06, + "loss": 0.8264997, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 2.5484061241149902 + }, + { + "auxiliary_loss_clip": 0.01113654, + "auxiliary_loss_mlp": 0.01085677, + "balance_loss_clip": 1.00111103, + "balance_loss_mlp": 1.00022817, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.7897960919279556, + "language_loss": 0.62701988, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64901316, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 3.088855028152466 + }, + { + "auxiliary_loss_clip": 0.01148762, + "auxiliary_loss_mlp": 0.01106512, + "balance_loss_clip": 1.00177467, + "balance_loss_mlp": 1.0005585, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.5629822883293498, + "language_loss": 0.78726649, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80981928, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.548149347305298 + }, + { + "auxiliary_loss_clip": 0.01118192, + "auxiliary_loss_mlp": 0.01106751, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00060749, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.5256833421961802, + "language_loss": 0.78859305, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81084251, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.5801289081573486 + }, + { + "auxiliary_loss_clip": 0.01132934, + "auxiliary_loss_mlp": 0.01106268, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00041008, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 1.9036525480430642, + "language_loss": 0.56407255, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.58646458, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 2.5232748985290527 + }, + { + "auxiliary_loss_clip": 0.01151064, + "auxiliary_loss_mlp": 0.01108156, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00039101, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 2.631620234688105, + "language_loss": 0.75911421, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78170639, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 3.8927958011627197 + }, + { + "auxiliary_loss_clip": 0.01085263, + "auxiliary_loss_mlp": 0.01106812, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00066781, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.2717000679116877, + "language_loss": 0.70600545, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72792619, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.731980562210083 + }, + { + "auxiliary_loss_clip": 0.01161656, + "auxiliary_loss_mlp": 0.01085088, + "balance_loss_clip": 1.00121939, + "balance_loss_mlp": 1.00002003, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8562471010787971, + "language_loss": 0.63022703, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65269446, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 3.025613784790039 + }, + { + "auxiliary_loss_clip": 0.01151176, + "auxiliary_loss_mlp": 0.01107686, + "balance_loss_clip": 1.00201547, + "balance_loss_mlp": 1.00049257, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.6666201506094669, + "language_loss": 0.62798929, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.6505779, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.541792154312134 + }, + { + "auxiliary_loss_clip": 0.01119531, + "auxiliary_loss_mlp": 0.01107659, + "balance_loss_clip": 1.00170481, + "balance_loss_mlp": 1.00046659, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.438437649601456, + "language_loss": 0.69971466, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72198653, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.71022891998291 + }, + { + "auxiliary_loss_clip": 0.01131663, + "auxiliary_loss_mlp": 0.01085082, + "balance_loss_clip": 1.00121462, + "balance_loss_mlp": 1.00001395, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9014275285828711, + "language_loss": 0.72115207, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74331957, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 3.125664710998535 + }, + { + "auxiliary_loss_clip": 0.01113963, + "auxiliary_loss_mlp": 0.01085076, + "balance_loss_clip": 1.00110447, + "balance_loss_mlp": 1.00000858, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7061701362065488, + "language_loss": 0.57820678, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.6001972, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.2945380210876465 + }, + { + "auxiliary_loss_clip": 0.01141024, + "auxiliary_loss_mlp": 0.01085056, + "balance_loss_clip": 1.00119352, + "balance_loss_mlp": 0.9999885, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7471662750035203, + "language_loss": 0.63578951, + "learning_rate": 1.062803450204029e-06, + "loss": 0.6580503, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.143927574157715 + }, + { + "auxiliary_loss_clip": 0.01165909, + "auxiliary_loss_mlp": 0.01106935, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00040936, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.7735230874260213, + "language_loss": 0.58954835, + "learning_rate": 1.062459413096116e-06, + "loss": 0.61227679, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 4.06880521774292 + }, + { + "auxiliary_loss_clip": 0.0115051, + "auxiliary_loss_mlp": 0.01107997, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00051785, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 3.176641736541102, + "language_loss": 0.72758031, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75016534, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 3.94028639793396 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01107584, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00048614, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 2.4018017009674923, + "language_loss": 0.7075237, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.73009169, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 2.6770970821380615 + }, + { + "auxiliary_loss_clip": 0.01116955, + "auxiliary_loss_mlp": 0.01107132, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.00051153, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.0658595239775366, + "language_loss": 0.56183505, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58407593, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.580784559249878 + }, + { + "auxiliary_loss_clip": 0.01165904, + "auxiliary_loss_mlp": 0.0074755, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00070167, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.9081645982434414, + "language_loss": 0.7217226, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74085712, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.611499786376953 + }, + { + "auxiliary_loss_clip": 0.01149169, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00046301, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.481079016989558, + "language_loss": 0.65844882, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68100369, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 2.660900831222534 + }, + { + "auxiliary_loss_clip": 0.01135909, + "auxiliary_loss_mlp": 0.01107708, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00051475, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.6321170423511997, + "language_loss": 0.75105906, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.7734952, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 4.018408536911011 + }, + { + "auxiliary_loss_clip": 0.01132766, + "auxiliary_loss_mlp": 0.01106744, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00050509, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 2.277167931814536, + "language_loss": 0.67012441, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.69251955, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.6041171550750732 + }, + { + "auxiliary_loss_clip": 0.01166072, + "auxiliary_loss_mlp": 0.01108219, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.0006448, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 1.822367007398027, + "language_loss": 0.6962195, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71896243, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 2.4551236629486084 + }, + { + "auxiliary_loss_clip": 0.0113413, + "auxiliary_loss_mlp": 0.01107394, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00048685, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.4148731002387656, + "language_loss": 0.80013853, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82255375, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 2.5808751583099365 + }, + { + "auxiliary_loss_clip": 0.01116964, + "auxiliary_loss_mlp": 0.01105918, + "balance_loss_clip": 1.00165725, + "balance_loss_mlp": 1.00044143, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 2.0324065705492633, + "language_loss": 0.78313208, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80536085, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 2.6137197017669678 + }, + { + "auxiliary_loss_clip": 0.01117566, + "auxiliary_loss_mlp": 0.01107569, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00047112, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.6826534542279625, + "language_loss": 0.79839754, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82064891, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 2.6462061405181885 + }, + { + "auxiliary_loss_clip": 0.01115092, + "auxiliary_loss_mlp": 0.01106856, + "balance_loss_clip": 1.00190008, + "balance_loss_mlp": 1.00071168, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.5143131195726791, + "language_loss": 0.84154898, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86376846, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 2.5933308601379395 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01108229, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00055909, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 1.9551166230338017, + "language_loss": 0.85149658, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87373942, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.587899684906006 + }, + { + "auxiliary_loss_clip": 0.01134583, + "auxiliary_loss_mlp": 0.01106935, + "balance_loss_clip": 1.00176084, + "balance_loss_mlp": 1.00041008, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 3.1205277050677838, + "language_loss": 0.72907835, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75149357, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 2.572705030441284 + }, + { + "auxiliary_loss_clip": 0.01134095, + "auxiliary_loss_mlp": 0.01107576, + "balance_loss_clip": 1.00186956, + "balance_loss_mlp": 1.00047803, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.8629161336339841, + "language_loss": 0.80419499, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82661164, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 2.6389739513397217 + }, + { + "auxiliary_loss_clip": 0.01132364, + "auxiliary_loss_mlp": 0.01107619, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00052094, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.9413613095136637, + "language_loss": 0.74800909, + "learning_rate": 1.056959663258702e-06, + "loss": 0.77040893, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 2.6488170623779297 + }, + { + "auxiliary_loss_clip": 0.0114915, + "auxiliary_loss_mlp": 0.01106909, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.0004791, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.7199426195083902, + "language_loss": 0.64716434, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.66972494, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.532193422317505 + }, + { + "auxiliary_loss_clip": 0.0115126, + "auxiliary_loss_mlp": 0.01107568, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.0004704, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 2.3059456139378534, + "language_loss": 0.64582425, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66841257, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 2.5133190155029297 + }, + { + "auxiliary_loss_clip": 0.01165947, + "auxiliary_loss_mlp": 0.0110758, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.0004828, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.150655378001413, + "language_loss": 0.81075346, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83348876, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.46235990524292 + }, + { + "auxiliary_loss_clip": 0.01134518, + "auxiliary_loss_mlp": 0.01107685, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00049186, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 2.0019415790431485, + "language_loss": 0.77534497, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79776704, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.582764148712158 + }, + { + "auxiliary_loss_clip": 0.0116596, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_clip": 1.00202262, + "balance_loss_mlp": 1.00058675, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 2.1198919548556736, + "language_loss": 0.79543304, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81815994, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.4848499298095703 + }, + { + "auxiliary_loss_clip": 0.01117216, + "auxiliary_loss_mlp": 0.01085055, + "balance_loss_clip": 1.00106966, + "balance_loss_mlp": 0.99998683, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.750195336809165, + "language_loss": 0.57696348, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59898621, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 3.192765712738037 + }, + { + "auxiliary_loss_clip": 0.01166015, + "auxiliary_loss_mlp": 0.01106888, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00045753, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 1.5716739261524986, + "language_loss": 0.76587975, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78860879, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.520501136779785 + }, + { + "auxiliary_loss_clip": 0.01165908, + "auxiliary_loss_mlp": 0.01107366, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00055408, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 2.700961214428333, + "language_loss": 0.73080957, + "learning_rate": 1.05421321798155e-06, + "loss": 0.7535423, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 2.5852890014648438 + }, + { + "auxiliary_loss_clip": 0.01151488, + "auxiliary_loss_mlp": 0.01107909, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.00062072, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.490394090777997, + "language_loss": 0.73136878, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75396276, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.5033063888549805 + }, + { + "auxiliary_loss_clip": 0.01118165, + "auxiliary_loss_mlp": 0.01106618, + "balance_loss_clip": 1.00172043, + "balance_loss_mlp": 1.00066459, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 1.870433154953575, + "language_loss": 0.64259434, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66484213, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 2.6233489513397217 + }, + { + "auxiliary_loss_clip": 0.01149213, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.00180352, + "balance_loss_mlp": 1.00054765, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.7823519804569108, + "language_loss": 0.75554633, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77811205, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.523468017578125 + }, + { + "auxiliary_loss_clip": 0.0116611, + "auxiliary_loss_mlp": 0.0110797, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00058627, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.629513120982539, + "language_loss": 0.74427116, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.767012, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.540601968765259 + }, + { + "auxiliary_loss_clip": 0.01150701, + "auxiliary_loss_mlp": 0.01107369, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00065303, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.778947175481244, + "language_loss": 0.78057659, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80315733, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 3.9629929065704346 + }, + { + "auxiliary_loss_clip": 0.01165874, + "auxiliary_loss_mlp": 0.01106984, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00064921, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 1.7178482353619025, + "language_loss": 0.59949887, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62222743, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.522202968597412 + }, + { + "auxiliary_loss_clip": 0.01134587, + "auxiliary_loss_mlp": 0.01108144, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.0005697, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.7389836720632699, + "language_loss": 0.71392167, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73634899, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 2.5868728160858154 + }, + { + "auxiliary_loss_clip": 0.01151217, + "auxiliary_loss_mlp": 0.01107332, + "balance_loss_clip": 1.00174105, + "balance_loss_mlp": 1.0004251, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.5054742947546775, + "language_loss": 0.84414744, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86673295, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.56115984916687 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01106914, + "balance_loss_clip": 1.00163889, + "balance_loss_mlp": 1.00048447, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.1317080835642557, + "language_loss": 0.78105533, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80346608, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.5608513355255127 + }, + { + "auxiliary_loss_clip": 0.01102866, + "auxiliary_loss_mlp": 0.01107517, + "balance_loss_clip": 1.00172782, + "balance_loss_mlp": 1.00051475, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.7668911132701728, + "language_loss": 0.58363736, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60574126, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 2.7922792434692383 + }, + { + "auxiliary_loss_clip": 0.01149592, + "auxiliary_loss_mlp": 0.01108774, + "balance_loss_clip": 1.0020051, + "balance_loss_mlp": 1.00053155, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 2.0988537791778374, + "language_loss": 0.73287135, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75545502, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.5475833415985107 + }, + { + "auxiliary_loss_clip": 0.01165997, + "auxiliary_loss_mlp": 0.01107459, + "balance_loss_clip": 1.00196719, + "balance_loss_mlp": 1.00055206, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.6332143801920358, + "language_loss": 0.76378858, + "learning_rate": 1.0500978558659e-06, + "loss": 0.7865231, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.5196917057037354 + }, + { + "auxiliary_loss_clip": 0.01135397, + "auxiliary_loss_mlp": 0.01106419, + "balance_loss_clip": 1.00191677, + "balance_loss_mlp": 1.0003705, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.1796291514955404, + "language_loss": 0.89775825, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92017639, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 2.586092233657837 + }, + { + "auxiliary_loss_clip": 0.0111869, + "auxiliary_loss_mlp": 0.01105904, + "balance_loss_clip": 1.00184286, + "balance_loss_mlp": 1.00042748, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.3596037130252072, + "language_loss": 0.82801461, + "learning_rate": 1.049412465858646e-06, + "loss": 0.8502605, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 4.2697296142578125 + }, + { + "auxiliary_loss_clip": 0.01132641, + "auxiliary_loss_mlp": 0.01107732, + "balance_loss_clip": 1.00166643, + "balance_loss_mlp": 1.00053859, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 4.706173768189869, + "language_loss": 0.69419944, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71660316, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 3.9052670001983643 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01107294, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00057769, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.5159742406120762, + "language_loss": 0.73661894, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75902116, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.6401641368865967 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.0110657, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.0005219, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 1.9804430116707494, + "language_loss": 0.65228504, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67501086, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.4898898601531982 + }, + { + "auxiliary_loss_clip": 0.01134634, + "auxiliary_loss_mlp": 0.01107144, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.00042832, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 2.7274295711502954, + "language_loss": 0.63130432, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65372211, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 2.555971145629883 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01106391, + "balance_loss_clip": 1.00177467, + "balance_loss_mlp": 1.00062859, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.871667080251921, + "language_loss": 0.6564188, + "learning_rate": 1.047699621879422e-06, + "loss": 0.67849278, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 2.6300339698791504 + }, + { + "auxiliary_loss_clip": 0.01151166, + "auxiliary_loss_mlp": 0.01107844, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00055587, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.4265912828998564, + "language_loss": 0.78598952, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80857956, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 3.930662155151367 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.00747586, + "balance_loss_clip": 1.00157356, + "balance_loss_mlp": 1.00075591, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 2.0612417323821517, + "language_loss": 0.79631013, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81481099, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 2.6960885524749756 + }, + { + "auxiliary_loss_clip": 0.01117909, + "auxiliary_loss_mlp": 0.01107566, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00065923, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.9029774608227243, + "language_loss": 0.79054582, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81280065, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 2.6550543308258057 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01107772, + "balance_loss_clip": 1.00183356, + "balance_loss_mlp": 1.00057924, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 5.042165324711354, + "language_loss": 0.65645748, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67854112, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 2.6782617568969727 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.01107386, + "balance_loss_clip": 1.00180042, + "balance_loss_mlp": 1.00038385, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.4329782704520613, + "language_loss": 0.68982553, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71224374, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.605668544769287 + }, + { + "auxiliary_loss_clip": 0.01134642, + "auxiliary_loss_mlp": 0.01107111, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00048971, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.5734152012430533, + "language_loss": 0.6744709, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69688845, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 2.6597166061401367 + }, + { + "auxiliary_loss_clip": 0.0111928, + "auxiliary_loss_mlp": 0.01107452, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00044966, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.579924261692647, + "language_loss": 0.7202127, + "learning_rate": 1.045303157347638e-06, + "loss": 0.74248004, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.665348768234253 + }, + { + "auxiliary_loss_clip": 0.011347, + "auxiliary_loss_mlp": 0.01107657, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.00055957, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.5306520870725047, + "language_loss": 0.69847286, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72089642, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 2.5504708290100098 + }, + { + "auxiliary_loss_clip": 0.0108809, + "auxiliary_loss_mlp": 0.00747383, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.00063252, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.566212745999393, + "language_loss": 0.71680415, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73515886, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.762157917022705 + }, + { + "auxiliary_loss_clip": 0.01134159, + "auxiliary_loss_mlp": 0.0110807, + "balance_loss_clip": 1.00198114, + "balance_loss_mlp": 1.00059104, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 6.892849228871689, + "language_loss": 0.79288971, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81531203, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 2.6166632175445557 + }, + { + "auxiliary_loss_clip": 0.0113286, + "auxiliary_loss_mlp": 0.01108065, + "balance_loss_clip": 1.00194728, + "balance_loss_mlp": 1.00058556, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 2.017416905687363, + "language_loss": 0.73924863, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76165783, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 2.602527618408203 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01107556, + "balance_loss_clip": 1.0021112, + "balance_loss_mlp": 1.00064909, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 1.9824606907276046, + "language_loss": 0.66886497, + "learning_rate": 1.043592482774116e-06, + "loss": 0.69113195, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 2.6247398853302 + }, + { + "auxiliary_loss_clip": 0.01151313, + "auxiliary_loss_mlp": 0.01107541, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00053859, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 1.6776050195070598, + "language_loss": 0.70846349, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73105204, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 2.54789662361145 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.0110867, + "balance_loss_clip": 1.00179362, + "balance_loss_mlp": 1.00052381, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 3.239230422752037, + "language_loss": 0.79921836, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82163143, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 2.5968332290649414 + }, + { + "auxiliary_loss_clip": 0.01166097, + "auxiliary_loss_mlp": 0.01107516, + "balance_loss_clip": 1.00198281, + "balance_loss_mlp": 1.00051415, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.703688092229311, + "language_loss": 0.80697769, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.82971382, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 2.4994757175445557 + }, + { + "auxiliary_loss_clip": 0.01151278, + "auxiliary_loss_mlp": 0.01105866, + "balance_loss_clip": 1.0018934, + "balance_loss_mlp": 1.00058031, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.5624475004168692, + "language_loss": 0.70423186, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.7268033, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.6173393726348877 + }, + { + "auxiliary_loss_clip": 0.01135388, + "auxiliary_loss_mlp": 0.01106296, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00062895, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.5353142605742427, + "language_loss": 0.70388937, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.7263062, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 2.584033489227295 + }, + { + "auxiliary_loss_clip": 0.01149229, + "auxiliary_loss_mlp": 0.01106917, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.0003916, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.2550811017521704, + "language_loss": 0.65258491, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.6751464, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 2.4958672523498535 + }, + { + "auxiliary_loss_clip": 0.0115149, + "auxiliary_loss_mlp": 0.01107415, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00050855, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.6103608935747242, + "language_loss": 0.74604475, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76863378, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.5252976417541504 + }, + { + "auxiliary_loss_clip": 0.01149092, + "auxiliary_loss_mlp": 0.01107836, + "balance_loss_clip": 1.00202405, + "balance_loss_mlp": 1.00064301, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 1.8950551496253232, + "language_loss": 0.66394848, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68651772, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.5938045978546143 + }, + { + "auxiliary_loss_clip": 0.0115148, + "auxiliary_loss_mlp": 0.01108717, + "balance_loss_clip": 1.0021106, + "balance_loss_mlp": 1.00057077, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.9144600835082, + "language_loss": 0.77268684, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79528886, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 2.569596529006958 + }, + { + "auxiliary_loss_clip": 0.01151162, + "auxiliary_loss_mlp": 0.01106782, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00044775, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.6424328867537072, + "language_loss": 0.74159849, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76417792, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.5410139560699463 + }, + { + "auxiliary_loss_clip": 0.01149904, + "auxiliary_loss_mlp": 0.01108001, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00071311, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.523415388338736, + "language_loss": 0.6239481, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64652717, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.5556013584136963 + }, + { + "auxiliary_loss_clip": 0.01165962, + "auxiliary_loss_mlp": 0.01106786, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00054669, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.5838619322743754, + "language_loss": 0.65587354, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.67860103, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 2.53410267829895 + }, + { + "auxiliary_loss_clip": 0.01120324, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_clip": 1.00170469, + "balance_loss_mlp": 1.00051928, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.6439326134871415, + "language_loss": 0.7262069, + "learning_rate": 1.039148976175053e-06, + "loss": 0.74846435, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 4.110128402709961 + }, + { + "auxiliary_loss_clip": 0.01119066, + "auxiliary_loss_mlp": 0.01106135, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00056314, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 2.056347001758909, + "language_loss": 0.71041965, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.73267162, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.6542439460754395 + }, + { + "auxiliary_loss_clip": 0.01150772, + "auxiliary_loss_mlp": 0.0110726, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.0004487, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 1.7222266104974904, + "language_loss": 0.75941211, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.78199244, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.6200051307678223 + }, + { + "auxiliary_loss_clip": 0.01149431, + "auxiliary_loss_mlp": 0.01108, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.00052071, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7306867497399192, + "language_loss": 0.82039696, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84297121, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 2.5746166706085205 + }, + { + "auxiliary_loss_clip": 0.01102229, + "auxiliary_loss_mlp": 0.01106421, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.0004673, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.569556164841883, + "language_loss": 0.69757313, + "learning_rate": 1.037782980862959e-06, + "loss": 0.71965963, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.7130613327026367 + }, + { + "auxiliary_loss_clip": 0.01117952, + "auxiliary_loss_mlp": 0.00747481, + "balance_loss_clip": 1.00177801, + "balance_loss_mlp": 1.00070524, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.4420765751984266, + "language_loss": 0.701208, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71986234, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.6832151412963867 + }, + { + "auxiliary_loss_clip": 0.01134174, + "auxiliary_loss_mlp": 0.01107167, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.00045145, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.5705668952083343, + "language_loss": 0.74627817, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76869166, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 2.5913338661193848 + }, + { + "auxiliary_loss_clip": 0.01133851, + "auxiliary_loss_mlp": 0.01107102, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00048184, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.4928771719999523, + "language_loss": 0.71012819, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.73253775, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 4.013985872268677 + }, + { + "auxiliary_loss_clip": 0.01165923, + "auxiliary_loss_mlp": 0.00747473, + "balance_loss_clip": 1.00199497, + "balance_loss_mlp": 1.00061238, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.847715598140311, + "language_loss": 0.78234994, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80148393, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 2.4819276332855225 + }, + { + "auxiliary_loss_clip": 0.01150673, + "auxiliary_loss_mlp": 0.00747405, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.0006355, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.5389087955523977, + "language_loss": 0.70043492, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.71941572, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 3.98177170753479 + }, + { + "auxiliary_loss_clip": 0.01135479, + "auxiliary_loss_mlp": 0.01106978, + "balance_loss_clip": 1.00180662, + "balance_loss_mlp": 1.00054836, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.6796495185416822, + "language_loss": 0.70355785, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72598249, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.5872910022735596 + }, + { + "auxiliary_loss_clip": 0.01134348, + "auxiliary_loss_mlp": 0.01106144, + "balance_loss_clip": 1.00164378, + "balance_loss_mlp": 1.00047648, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.527502018411333, + "language_loss": 0.73867726, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.76108217, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.6347038745880127 + }, + { + "auxiliary_loss_clip": 0.01149225, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00047183, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.8167731615019689, + "language_loss": 0.78354943, + "learning_rate": 1.035052742460671e-06, + "loss": 0.8061049, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 2.5542421340942383 + }, + { + "auxiliary_loss_clip": 0.01096671, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_clip": 1.00113702, + "balance_loss_mlp": 1.00003719, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.792690809587692, + "language_loss": 0.5547561, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57657385, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 3.316119909286499 + }, + { + "auxiliary_loss_clip": 0.01132489, + "auxiliary_loss_mlp": 0.0110693, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00059557, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 3.7703747039876867, + "language_loss": 0.80815721, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.83055133, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 4.101595401763916 + }, + { + "auxiliary_loss_clip": 0.0111588, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.00167716, + "balance_loss_mlp": 1.00060546, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 2.0001289905941753, + "language_loss": 0.76092935, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.77956271, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 2.6351656913757324 + }, + { + "auxiliary_loss_clip": 0.01136343, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00062072, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.4322521155323809, + "language_loss": 0.75815356, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78059709, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 2.582329750061035 + }, + { + "auxiliary_loss_clip": 0.01166134, + "auxiliary_loss_mlp": 0.01106982, + "balance_loss_clip": 1.0020777, + "balance_loss_mlp": 1.00074315, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 1.7861801332878529, + "language_loss": 0.82355982, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84629101, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.542177200317383 + }, + { + "auxiliary_loss_clip": 0.01165808, + "auxiliary_loss_mlp": 0.01106935, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00050461, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 1.786611397718776, + "language_loss": 0.74768138, + "learning_rate": 1.033006600114165e-06, + "loss": 0.77040881, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 2.494702100753784 + }, + { + "auxiliary_loss_clip": 0.01151228, + "auxiliary_loss_mlp": 0.01107951, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00056756, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.6642216329509527, + "language_loss": 0.7436955, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76628727, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.5781121253967285 + }, + { + "auxiliary_loss_clip": 0.01165898, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00061321, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 1.46703048874863, + "language_loss": 0.8184303, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.84116256, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 2.526334047317505 + }, + { + "auxiliary_loss_clip": 0.01134499, + "auxiliary_loss_mlp": 0.01106797, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.0005579, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.5906530116162432, + "language_loss": 0.76856947, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79098248, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.5458853244781494 + }, + { + "auxiliary_loss_clip": 0.01132392, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_clip": 1.0018127, + "balance_loss_mlp": 1.00038409, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 2.9511994880223305, + "language_loss": 0.73498523, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75737345, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.59177827835083 + }, + { + "auxiliary_loss_clip": 0.01134827, + "auxiliary_loss_mlp": 0.01107591, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.0006845, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 2.705729507773006, + "language_loss": 0.68388069, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70630491, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 2.617337465286255 + }, + { + "auxiliary_loss_clip": 0.01136463, + "auxiliary_loss_mlp": 0.01106771, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00072265, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.6457031899192187, + "language_loss": 0.6995213, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72195363, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.5429184436798096 + }, + { + "auxiliary_loss_clip": 0.01165911, + "auxiliary_loss_mlp": 0.01106311, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00054836, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.8414151544286599, + "language_loss": 0.75629342, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.7790156, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 2.5407347679138184 + }, + { + "auxiliary_loss_clip": 0.01165968, + "auxiliary_loss_mlp": 0.01107169, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00054812, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 1.882245808707638, + "language_loss": 0.65041298, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67314434, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 2.5020644664764404 + }, + { + "auxiliary_loss_clip": 0.0116597, + "auxiliary_loss_mlp": 0.01106512, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00046384, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.0150021667931317, + "language_loss": 0.7131831, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73590791, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.5032875537872314 + }, + { + "auxiliary_loss_clip": 0.01165828, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_clip": 1.00200593, + "balance_loss_mlp": 1.00043201, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.7192536420057198, + "language_loss": 0.76979673, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79251885, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 2.5176827907562256 + }, + { + "auxiliary_loss_clip": 0.01151043, + "auxiliary_loss_mlp": 0.01107667, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00056934, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 1.9352312409978047, + "language_loss": 0.68958366, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71217072, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.641166925430298 + }, + { + "auxiliary_loss_clip": 0.01116478, + "auxiliary_loss_mlp": 0.01108099, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00071478, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 1.77346827377957, + "language_loss": 0.73293221, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75517797, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.6505134105682373 + }, + { + "auxiliary_loss_clip": 0.01150705, + "auxiliary_loss_mlp": 0.01108126, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00055122, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.1345900692547226, + "language_loss": 0.76104712, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78363538, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.491151809692383 + }, + { + "auxiliary_loss_clip": 0.01132527, + "auxiliary_loss_mlp": 0.01107156, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.00043988, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 1.7224165609649162, + "language_loss": 0.74222428, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76462108, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.534282684326172 + }, + { + "auxiliary_loss_clip": 0.01119705, + "auxiliary_loss_mlp": 0.01107251, + "balance_loss_clip": 1.00171816, + "balance_loss_mlp": 1.0006299, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.6554652242193548, + "language_loss": 0.86252576, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88479531, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 2.5843260288238525 + }, + { + "auxiliary_loss_clip": 0.01151048, + "auxiliary_loss_mlp": 0.0110702, + "balance_loss_clip": 1.00183117, + "balance_loss_mlp": 1.00049484, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.6047605096538826, + "language_loss": 0.63539064, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65797132, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.5407490730285645 + }, + { + "auxiliary_loss_clip": 0.01150954, + "auxiliary_loss_mlp": 0.01108743, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00059688, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.2715620178013047, + "language_loss": 0.71788752, + "learning_rate": 1.02721637475002e-06, + "loss": 0.74048448, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 2.5048718452453613 + }, + { + "auxiliary_loss_clip": 0.01116263, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00055861, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 1.901193360496324, + "language_loss": 0.68884236, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.71107203, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.578045129776001 + }, + { + "auxiliary_loss_clip": 0.01130601, + "auxiliary_loss_mlp": 0.01106224, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00065231, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 1.8498792253674712, + "language_loss": 0.7368685, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.75923669, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 3.9611189365386963 + }, + { + "auxiliary_loss_clip": 0.01132432, + "auxiliary_loss_mlp": 0.01106613, + "balance_loss_clip": 1.00170004, + "balance_loss_mlp": 1.00046897, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.6811673060528598, + "language_loss": 0.72855908, + "learning_rate": 1.026195675108182e-06, + "loss": 0.7509495, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 2.5837900638580322 + }, + { + "auxiliary_loss_clip": 0.01165972, + "auxiliary_loss_mlp": 0.01106938, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.00069845, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 1.9970294257898151, + "language_loss": 0.7663722, + "learning_rate": 1.025855515730551e-06, + "loss": 0.7891013, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.546311140060425 + }, + { + "auxiliary_loss_clip": 0.0114929, + "auxiliary_loss_mlp": 0.01106226, + "balance_loss_clip": 1.00198042, + "balance_loss_mlp": 1.00046372, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.5814000689394145, + "language_loss": 0.6991936, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72174871, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 2.5115537643432617 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.00048745, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.670364045645402, + "language_loss": 0.74070084, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76277983, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.6808524131774902 + }, + { + "auxiliary_loss_clip": 0.01136234, + "auxiliary_loss_mlp": 0.0110672, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00048089, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.3124272504657042, + "language_loss": 0.75189376, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77432323, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.6307549476623535 + }, + { + "auxiliary_loss_clip": 0.01134287, + "auxiliary_loss_mlp": 0.01106218, + "balance_loss_clip": 1.00170887, + "balance_loss_mlp": 1.00035989, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 2.2095083566544274, + "language_loss": 0.74444294, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76684797, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 2.565619945526123 + }, + { + "auxiliary_loss_clip": 0.0114914, + "auxiliary_loss_mlp": 0.01105845, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.0004636, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.7652312912008872, + "language_loss": 0.69467062, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71722049, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.5529367923736572 + }, + { + "auxiliary_loss_clip": 0.01102771, + "auxiliary_loss_mlp": 0.01107018, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00068355, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.5805253709583484, + "language_loss": 0.77679247, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79889041, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 4.060798168182373 + }, + { + "auxiliary_loss_clip": 0.0113453, + "auxiliary_loss_mlp": 0.0074779, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00067639, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 2.646168638155756, + "language_loss": 0.66213071, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.68095398, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.586137533187866 + }, + { + "auxiliary_loss_clip": 0.01117838, + "auxiliary_loss_mlp": 0.01107081, + "balance_loss_clip": 1.00177348, + "balance_loss_mlp": 1.00055552, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.5744510637141396, + "language_loss": 0.80503845, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82728761, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 4.107446193695068 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01106172, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.0006007, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.7717815693346437, + "language_loss": 0.80091798, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82347143, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.5576412677764893 + }, + { + "auxiliary_loss_clip": 0.01101326, + "auxiliary_loss_mlp": 0.0110842, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00055933, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.6914485925377787, + "language_loss": 0.70834458, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73044205, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.6840827465057373 + }, + { + "auxiliary_loss_clip": 0.01070425, + "auxiliary_loss_mlp": 0.01106815, + "balance_loss_clip": 1.00156736, + "balance_loss_mlp": 1.00057554, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.667239491774771, + "language_loss": 0.7582711, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.78004354, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 2.760843276977539 + }, + { + "auxiliary_loss_clip": 0.01165976, + "auxiliary_loss_mlp": 0.01108348, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00048721, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.211482544903148, + "language_loss": 0.75381684, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77656013, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.509998321533203 + }, + { + "auxiliary_loss_clip": 0.01084839, + "auxiliary_loss_mlp": 0.01106149, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00057697, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.4638627624036717, + "language_loss": 0.77202737, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79393733, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 4.2116172313690186 + }, + { + "auxiliary_loss_clip": 0.01165831, + "auxiliary_loss_mlp": 0.01106662, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00042319, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 3.2022065837183873, + "language_loss": 0.86377382, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88649869, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 2.6365294456481934 + }, + { + "auxiliary_loss_clip": 0.01151444, + "auxiliary_loss_mlp": 0.01107819, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.0005306, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 2.042894057233625, + "language_loss": 0.76216674, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.7847594, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.5731985569000244 + }, + { + "auxiliary_loss_clip": 0.01117591, + "auxiliary_loss_mlp": 0.01107147, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00071669, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.7938103978352817, + "language_loss": 0.78859293, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.81084031, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 2.6285207271575928 + }, + { + "auxiliary_loss_clip": 0.01149366, + "auxiliary_loss_mlp": 0.01106813, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00047803, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 1.8972144631017476, + "language_loss": 0.90021247, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.9227742, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 2.5362765789031982 + }, + { + "auxiliary_loss_clip": 0.01150546, + "auxiliary_loss_mlp": 0.01107443, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00072646, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.6760134088526677, + "language_loss": 0.72386813, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74644804, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.598691940307617 + }, + { + "auxiliary_loss_clip": 0.01076116, + "auxiliary_loss_mlp": 0.01084562, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00025678, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7730753465839613, + "language_loss": 0.56572098, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58732772, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 3.2654902935028076 + }, + { + "auxiliary_loss_clip": 0.01133912, + "auxiliary_loss_mlp": 0.01106963, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.0004375, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.444479555567145, + "language_loss": 0.75189847, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77430725, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 3.0250682830810547 + }, + { + "auxiliary_loss_clip": 0.01150665, + "auxiliary_loss_mlp": 0.01107229, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00041723, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 1.9974370215165693, + "language_loss": 0.81861937, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.84119833, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 2.5115082263946533 + }, + { + "auxiliary_loss_clip": 0.01101272, + "auxiliary_loss_mlp": 0.01107465, + "balance_loss_clip": 1.00158286, + "balance_loss_mlp": 1.00065339, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.66667780015706, + "language_loss": 0.71288621, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73497361, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.7815897464752197 + }, + { + "auxiliary_loss_clip": 0.01166002, + "auxiliary_loss_mlp": 0.01107741, + "balance_loss_clip": 1.00204599, + "balance_loss_mlp": 1.00064361, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.5298461251537778, + "language_loss": 0.64533424, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66807169, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.8767123222351074 + }, + { + "auxiliary_loss_clip": 0.01132851, + "auxiliary_loss_mlp": 0.01107521, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00061417, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.8237413975953085, + "language_loss": 0.63140047, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65380418, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 2.583508014678955 + }, + { + "auxiliary_loss_clip": 0.01165963, + "auxiliary_loss_mlp": 0.01106958, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00043225, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.637911439479309, + "language_loss": 0.74668407, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76941323, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 2.4818432331085205 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01108517, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00046611, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.573911176768251, + "language_loss": 0.67309517, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.6955179, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.5843544006347656 + }, + { + "auxiliary_loss_clip": 0.01149435, + "auxiliary_loss_mlp": 0.01107348, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00044107, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 2.164497344465799, + "language_loss": 0.74010241, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76267016, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.542771816253662 + }, + { + "auxiliary_loss_clip": 0.01165691, + "auxiliary_loss_mlp": 0.01106401, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00044787, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.5934831537336522, + "language_loss": 0.71745634, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.74017727, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 2.5658304691314697 + }, + { + "auxiliary_loss_clip": 0.01117942, + "auxiliary_loss_mlp": 0.00747626, + "balance_loss_clip": 1.00203466, + "balance_loss_mlp": 1.00062656, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 1.8723965063376489, + "language_loss": 0.67290115, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69155681, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.6524229049682617 + }, + { + "auxiliary_loss_clip": 0.01102456, + "auxiliary_loss_mlp": 0.01105852, + "balance_loss_clip": 1.00169933, + "balance_loss_mlp": 1.00047064, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.5319549810693234, + "language_loss": 0.73402882, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.75611192, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 2.644712209701538 + }, + { + "auxiliary_loss_clip": 0.01150843, + "auxiliary_loss_mlp": 0.01107366, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00055444, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 2.072249738741312, + "language_loss": 0.75682485, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77940691, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.578977346420288 + }, + { + "auxiliary_loss_clip": 0.01115126, + "auxiliary_loss_mlp": 0.01105261, + "balance_loss_clip": 1.00174975, + "balance_loss_mlp": 1.00045192, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.604249575367824, + "language_loss": 0.66295797, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68516183, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.683039903640747 + }, + { + "auxiliary_loss_clip": 0.01165752, + "auxiliary_loss_mlp": 0.01105645, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00045478, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.0148574675251054, + "language_loss": 0.80670226, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82941622, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.504211187362671 + }, + { + "auxiliary_loss_clip": 0.01101418, + "auxiliary_loss_mlp": 0.01106345, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00039148, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.3295992894421675, + "language_loss": 0.76629996, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78837752, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 2.686410665512085 + }, + { + "auxiliary_loss_clip": 0.01117443, + "auxiliary_loss_mlp": 0.00747701, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.00075376, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.78963501413916, + "language_loss": 0.77774358, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.79639506, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 4.012991428375244 + }, + { + "auxiliary_loss_clip": 0.01098541, + "auxiliary_loss_mlp": 0.01107243, + "balance_loss_clip": 1.00183237, + "balance_loss_mlp": 1.00052643, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 2.9661544536251627, + "language_loss": 0.6787014, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.70075929, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 2.6576340198516846 + }, + { + "auxiliary_loss_clip": 0.01166079, + "auxiliary_loss_mlp": 0.00747723, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00074983, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.7182399709595217, + "language_loss": 0.72605026, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74518836, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.6243464946746826 + }, + { + "auxiliary_loss_clip": 0.01151055, + "auxiliary_loss_mlp": 0.00747558, + "balance_loss_clip": 1.00185895, + "balance_loss_mlp": 1.0005548, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 1.763771367616294, + "language_loss": 0.67150283, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69048893, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.701864719390869 + }, + { + "auxiliary_loss_clip": 0.01161656, + "auxiliary_loss_mlp": 0.01084875, + "balance_loss_clip": 1.00126362, + "balance_loss_mlp": 1.00018907, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6724569528180641, + "language_loss": 0.56278872, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58525407, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.23008131980896 + }, + { + "auxiliary_loss_clip": 0.01149111, + "auxiliary_loss_mlp": 0.01106469, + "balance_loss_clip": 1.00188816, + "balance_loss_mlp": 1.00042057, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.7476024396965988, + "language_loss": 0.7427156, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76527137, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.5660791397094727 + }, + { + "auxiliary_loss_clip": 0.01115706, + "auxiliary_loss_mlp": 0.01107209, + "balance_loss_clip": 1.00173461, + "balance_loss_mlp": 1.00068331, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6001809298716618, + "language_loss": 0.65747499, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.67970413, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.619131326675415 + }, + { + "auxiliary_loss_clip": 0.0111961, + "auxiliary_loss_mlp": 0.01107916, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00053215, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.6219777543140301, + "language_loss": 0.74967682, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77195203, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.6522088050842285 + }, + { + "auxiliary_loss_clip": 0.01149533, + "auxiliary_loss_mlp": 0.01107519, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.0006125, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.621363008652726, + "language_loss": 0.70137775, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72394824, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.582028865814209 + }, + { + "auxiliary_loss_clip": 0.01119151, + "auxiliary_loss_mlp": 0.01107352, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00044525, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 1.7137461532883531, + "language_loss": 0.5790813, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60134625, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 4.252781629562378 + }, + { + "auxiliary_loss_clip": 0.0114924, + "auxiliary_loss_mlp": 0.01107812, + "balance_loss_clip": 1.0019592, + "balance_loss_mlp": 1.00052404, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 1.634239693185576, + "language_loss": 0.7646699, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.78724045, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 3.9490294456481934 + }, + { + "auxiliary_loss_clip": 0.01151234, + "auxiliary_loss_mlp": 0.0110765, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00055289, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.6369155952430319, + "language_loss": 0.75044477, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77303362, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 2.5286476612091064 + }, + { + "auxiliary_loss_clip": 0.01099139, + "auxiliary_loss_mlp": 0.01106712, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00047314, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.6107532787886358, + "language_loss": 0.6345942, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65665269, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 2.6471610069274902 + }, + { + "auxiliary_loss_clip": 0.0116566, + "auxiliary_loss_mlp": 0.00747474, + "balance_loss_clip": 1.00184596, + "balance_loss_mlp": 1.00059235, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.8399944232681835, + "language_loss": 0.64101213, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66014349, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 2.4886350631713867 + }, + { + "auxiliary_loss_clip": 0.01151168, + "auxiliary_loss_mlp": 0.01107176, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00045979, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.3693268142199955, + "language_loss": 0.71923184, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.74181527, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 2.500606060028076 + }, + { + "auxiliary_loss_clip": 0.01133708, + "auxiliary_loss_mlp": 0.01106937, + "balance_loss_clip": 1.00178516, + "balance_loss_mlp": 1.00041127, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 2.1697241461989645, + "language_loss": 0.71022773, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73263419, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 4.030470132827759 + }, + { + "auxiliary_loss_clip": 0.0113116, + "auxiliary_loss_mlp": 0.01084476, + "balance_loss_clip": 1.00169516, + "balance_loss_mlp": 1.00017118, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7532690582313813, + "language_loss": 0.53259939, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55475575, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.244072198867798 + }, + { + "auxiliary_loss_clip": 0.01150819, + "auxiliary_loss_mlp": 0.0110631, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00064337, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.6730364250026453, + "language_loss": 0.80338287, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.8259542, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.5580830574035645 + }, + { + "auxiliary_loss_clip": 0.01134557, + "auxiliary_loss_mlp": 0.01106248, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00038993, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.6238958934688477, + "language_loss": 0.6571185, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.67952657, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.583195924758911 + }, + { + "auxiliary_loss_clip": 0.0109946, + "auxiliary_loss_mlp": 0.01107731, + "balance_loss_clip": 1.00167394, + "balance_loss_mlp": 1.00053847, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 3.6064276891280733, + "language_loss": 0.66707486, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68914676, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.743123769760132 + }, + { + "auxiliary_loss_clip": 0.01118647, + "auxiliary_loss_mlp": 0.01106233, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00047016, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.6002610467541103, + "language_loss": 0.72375798, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74600685, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 2.6153712272644043 + }, + { + "auxiliary_loss_clip": 0.01151077, + "auxiliary_loss_mlp": 0.01107555, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00055242, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.4940833433189917, + "language_loss": 0.76906753, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79165375, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 2.581585168838501 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01107585, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.00058293, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.565279214789256, + "language_loss": 0.7513603, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77409565, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.5453152656555176 + }, + { + "auxiliary_loss_clip": 0.01146629, + "auxiliary_loss_mlp": 0.01084794, + "balance_loss_clip": 1.00114655, + "balance_loss_mlp": 1.00010824, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7809910360430955, + "language_loss": 0.51354778, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53586197, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 3.073796272277832 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_clip": 1.00173736, + "balance_loss_mlp": 1.00047958, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 1.877313947396848, + "language_loss": 0.75826132, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.78051531, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 2.634749174118042 + }, + { + "auxiliary_loss_clip": 0.01134528, + "auxiliary_loss_mlp": 0.01107284, + "balance_loss_clip": 1.00205827, + "balance_loss_mlp": 1.00056815, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 1.8777302720291214, + "language_loss": 0.77449375, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79691184, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.6396985054016113 + }, + { + "auxiliary_loss_clip": 0.01151136, + "auxiliary_loss_mlp": 0.01107773, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.0005796, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.6699609584929132, + "language_loss": 0.66560602, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68819517, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 2.5931525230407715 + }, + { + "auxiliary_loss_clip": 0.01132412, + "auxiliary_loss_mlp": 0.01106117, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00054502, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.1184922738231964, + "language_loss": 0.83230549, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85469079, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.543649435043335 + }, + { + "auxiliary_loss_clip": 0.01119455, + "auxiliary_loss_mlp": 0.01109253, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00043929, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 3.21640216504473, + "language_loss": 0.74340755, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76569462, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 2.6189215183258057 + }, + { + "auxiliary_loss_clip": 0.01098905, + "auxiliary_loss_mlp": 0.0110745, + "balance_loss_clip": 1.00202572, + "balance_loss_mlp": 1.00054312, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 3.070616593289849, + "language_loss": 0.80319989, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82526338, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 2.630560874938965 + }, + { + "auxiliary_loss_clip": 0.01150868, + "auxiliary_loss_mlp": 0.01106763, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00042844, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.671461388856093, + "language_loss": 0.72717327, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.7497496, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 2.589327096939087 + }, + { + "auxiliary_loss_clip": 0.01149062, + "auxiliary_loss_mlp": 0.01105873, + "balance_loss_clip": 1.00183475, + "balance_loss_mlp": 1.00068319, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.5504446437119785, + "language_loss": 0.72611356, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74866295, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 2.539113759994507 + }, + { + "auxiliary_loss_clip": 0.01166018, + "auxiliary_loss_mlp": 0.0110741, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.0005033, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.8584261749100566, + "language_loss": 0.85729074, + "learning_rate": 1.003149631190393e-06, + "loss": 0.88002497, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.505173683166504 + }, + { + "auxiliary_loss_clip": 0.01166124, + "auxiliary_loss_mlp": 0.00747822, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.00078177, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 1.8037722203887854, + "language_loss": 0.74037129, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75951076, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.5167112350463867 + }, + { + "auxiliary_loss_clip": 0.01149186, + "auxiliary_loss_mlp": 0.01106614, + "balance_loss_clip": 1.00188506, + "balance_loss_mlp": 1.00047016, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.9362663986263655, + "language_loss": 0.87910652, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 2.524764060974121 + }, + { + "auxiliary_loss_clip": 0.01131232, + "auxiliary_loss_mlp": 0.0108487, + "balance_loss_clip": 1.00160718, + "balance_loss_mlp": 1.00018334, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8218032034105467, + "language_loss": 0.54059893, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56275994, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 3.1693689823150635 + }, + { + "auxiliary_loss_clip": 0.01085163, + "auxiliary_loss_mlp": 0.01106101, + "balance_loss_clip": 1.00182033, + "balance_loss_mlp": 1.00062513, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.5157690724326212, + "language_loss": 0.73248428, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75439692, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.7141265869140625 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01107833, + "balance_loss_clip": 1.00175476, + "balance_loss_mlp": 1.00054443, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 1.8462468604082376, + "language_loss": 0.73677367, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.75934416, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.5033817291259766 + }, + { + "auxiliary_loss_clip": 0.01165956, + "auxiliary_loss_mlp": 0.0110703, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00050426, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.8389810220933664, + "language_loss": 0.75229025, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77502006, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 3.874207019805908 + }, + { + "auxiliary_loss_clip": 0.01118232, + "auxiliary_loss_mlp": 0.01106704, + "balance_loss_clip": 1.0020386, + "balance_loss_mlp": 1.0004642, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 1.8196442938851354, + "language_loss": 0.69947052, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72171992, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 2.602623701095581 + }, + { + "auxiliary_loss_clip": 0.01099356, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00060868, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.7606527172511215, + "language_loss": 0.6708138, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69287586, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.705899953842163 + }, + { + "auxiliary_loss_clip": 0.0111976, + "auxiliary_loss_mlp": 0.00747627, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00067317, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.5242873828449972, + "language_loss": 0.77089065, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.78956449, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.632333993911743 + }, + { + "auxiliary_loss_clip": 0.01149165, + "auxiliary_loss_mlp": 0.0110654, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00049114, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.136595780950676, + "language_loss": 0.72028816, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74284518, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.5323331356048584 + }, + { + "auxiliary_loss_clip": 0.0110286, + "auxiliary_loss_mlp": 0.0074752, + "balance_loss_clip": 1.00174725, + "balance_loss_mlp": 1.0005734, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.455302583120175, + "language_loss": 0.74981916, + "learning_rate": 9.994379131600828e-07, + "loss": 0.76832294, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.700505495071411 + }, + { + "auxiliary_loss_clip": 0.01149273, + "auxiliary_loss_mlp": 0.01107411, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00059903, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.017087090520325, + "language_loss": 0.65026295, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67282975, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.503791093826294 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01106854, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00051963, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.3874215974109465, + "language_loss": 0.75781882, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77990973, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.661159038543701 + }, + { + "auxiliary_loss_clip": 0.01132178, + "auxiliary_loss_mlp": 0.01105527, + "balance_loss_clip": 1.0017488, + "balance_loss_mlp": 1.00043201, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.8175505932715166, + "language_loss": 0.66809201, + "learning_rate": 9.984264224779127e-07, + "loss": 0.69046903, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 4.1364850997924805 + }, + { + "auxiliary_loss_clip": 0.01133898, + "auxiliary_loss_mlp": 0.01106497, + "balance_loss_clip": 1.00176871, + "balance_loss_mlp": 1.00054324, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.0287860284310084, + "language_loss": 0.85538995, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87779391, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.5606117248535156 + }, + { + "auxiliary_loss_clip": 0.01134638, + "auxiliary_loss_mlp": 0.01106868, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00062871, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.0729264873682465, + "language_loss": 0.77518821, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79760319, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 3.9498751163482666 + }, + { + "auxiliary_loss_clip": 0.01132666, + "auxiliary_loss_mlp": 0.01106431, + "balance_loss_clip": 1.00161529, + "balance_loss_mlp": 1.00057364, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.7218411121397188, + "language_loss": 0.87888932, + "learning_rate": 9.97415273613666e-07, + "loss": 0.90128034, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.5577921867370605 + }, + { + "auxiliary_loss_clip": 0.01132622, + "auxiliary_loss_mlp": 0.01107537, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.00053525, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.016911717630294, + "language_loss": 0.74485755, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76725912, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 2.56245493888855 + }, + { + "auxiliary_loss_clip": 0.01151286, + "auxiliary_loss_mlp": 0.01107813, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00042915, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.1111643015617814, + "language_loss": 0.67632484, + "learning_rate": 9.967413644401016e-07, + "loss": 0.69891578, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 2.5384411811828613 + }, + { + "auxiliary_loss_clip": 0.01132443, + "auxiliary_loss_mlp": 0.01107026, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.0005008, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 2.004784219401326, + "language_loss": 0.73285401, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75524867, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.558336019515991 + }, + { + "auxiliary_loss_clip": 0.01119012, + "auxiliary_loss_mlp": 0.01107286, + "balance_loss_clip": 1.00159919, + "balance_loss_mlp": 1.00047481, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.500433209017826, + "language_loss": 0.6126821, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63494515, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 2.605525255203247 + }, + { + "auxiliary_loss_clip": 0.01117199, + "auxiliary_loss_mlp": 0.011071, + "balance_loss_clip": 1.0017122, + "balance_loss_mlp": 1.00057435, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 1.8963148804133758, + "language_loss": 0.70616019, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72840321, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 3.9850757122039795 + }, + { + "auxiliary_loss_clip": 0.01165971, + "auxiliary_loss_mlp": 0.01107011, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.0004853, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.9460894433297191, + "language_loss": 0.70973444, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73246419, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.501459836959839 + }, + { + "auxiliary_loss_clip": 0.01135932, + "auxiliary_loss_mlp": 0.01107366, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00045955, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.4716485275221893, + "language_loss": 0.76821482, + "learning_rate": 9.950572574939194e-07, + "loss": 0.7906478, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.60547137260437 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01107789, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.00059581, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 1.9828032257969745, + "language_loss": 0.74556196, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76781058, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.6096885204315186 + }, + { + "auxiliary_loss_clip": 0.01097242, + "auxiliary_loss_mlp": 0.01106633, + "balance_loss_clip": 1.00195348, + "balance_loss_mlp": 1.000489, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.5569382063675228, + "language_loss": 0.72875607, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75079477, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 2.683866262435913 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01107796, + "balance_loss_clip": 1.00195396, + "balance_loss_mlp": 1.00069821, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.6086671043662217, + "language_loss": 0.67931288, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70205015, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 2.5412611961364746 + }, + { + "auxiliary_loss_clip": 0.01149186, + "auxiliary_loss_mlp": 0.01107906, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.0005219, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 1.9753832831203109, + "language_loss": 0.73865509, + "learning_rate": 9.937106577958481e-07, + "loss": 0.761226, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 2.5259897708892822 + }, + { + "auxiliary_loss_clip": 0.01146953, + "auxiliary_loss_mlp": 0.01106898, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.00056338, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.6191672832297581, + "language_loss": 0.70369059, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72622919, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 2.5456814765930176 + }, + { + "auxiliary_loss_clip": 0.01165839, + "auxiliary_loss_mlp": 0.01107321, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00050998, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.6122199424128443, + "language_loss": 0.65782154, + "learning_rate": 9.930375868473093e-07, + "loss": 0.6805532, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.5111846923828125 + }, + { + "auxiliary_loss_clip": 0.01151238, + "auxiliary_loss_mlp": 0.01107473, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.00056648, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 1.525004542138333, + "language_loss": 0.72685164, + "learning_rate": 9.927011086428335e-07, + "loss": 0.7494387, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 2.579212188720703 + }, + { + "auxiliary_loss_clip": 0.01136133, + "auxiliary_loss_mlp": 0.00747658, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00059366, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.7862289972783816, + "language_loss": 0.76633304, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78517091, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 2.5876007080078125 + }, + { + "auxiliary_loss_clip": 0.01134523, + "auxiliary_loss_mlp": 0.01107761, + "balance_loss_clip": 1.00192058, + "balance_loss_mlp": 1.00047231, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 2.909115986066812, + "language_loss": 0.83613193, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85855478, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.540025472640991 + }, + { + "auxiliary_loss_clip": 0.01132599, + "auxiliary_loss_mlp": 0.00747571, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00070024, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.4733690678744815, + "language_loss": 0.70593232, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72473407, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 2.638136386871338 + }, + { + "auxiliary_loss_clip": 0.01151263, + "auxiliary_loss_mlp": 0.01106125, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00045824, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.768623352524349, + "language_loss": 0.73841625, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76099014, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 2.5728564262390137 + }, + { + "auxiliary_loss_clip": 0.0115139, + "auxiliary_loss_mlp": 0.0110824, + "balance_loss_clip": 1.00186765, + "balance_loss_mlp": 1.00047469, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.7458950101428814, + "language_loss": 0.70483124, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72742748, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 2.5651793479919434 + }, + { + "auxiliary_loss_clip": 0.01165842, + "auxiliary_loss_mlp": 0.01107944, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00046539, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.6431662570460457, + "language_loss": 0.63944244, + "learning_rate": 9.906830419968217e-07, + "loss": 0.6621803, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 2.5760862827301025 + }, + { + "auxiliary_loss_clip": 0.01121403, + "auxiliary_loss_mlp": 0.01108455, + "balance_loss_clip": 1.00181329, + "balance_loss_mlp": 1.00059414, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5090217442112495, + "language_loss": 0.74237448, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76467299, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 2.70060133934021 + }, + { + "auxiliary_loss_clip": 0.01150712, + "auxiliary_loss_mlp": 0.01105895, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00041819, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.8279519260573736, + "language_loss": 0.57348466, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59605068, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.666496753692627 + }, + { + "auxiliary_loss_clip": 0.01132617, + "auxiliary_loss_mlp": 0.01106762, + "balance_loss_clip": 1.00185084, + "balance_loss_mlp": 1.00042701, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 3.951038860709017, + "language_loss": 0.75695515, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77934897, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.555184841156006 + }, + { + "auxiliary_loss_clip": 0.01165771, + "auxiliary_loss_mlp": 0.01106336, + "balance_loss_clip": 1.00199604, + "balance_loss_mlp": 1.00057316, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.5722921642608065, + "language_loss": 0.66294134, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68566239, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 2.5141749382019043 + }, + { + "auxiliary_loss_clip": 0.01132434, + "auxiliary_loss_mlp": 0.01106821, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.00048673, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.673519005920357, + "language_loss": 0.52913785, + "learning_rate": 9.890023721933447e-07, + "loss": 0.55153036, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.60223126411438 + }, + { + "auxiliary_loss_clip": 0.01120306, + "auxiliary_loss_mlp": 0.01106922, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00068283, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.7818939397879665, + "language_loss": 0.77345884, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79573107, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 4.0785744190216064 + }, + { + "auxiliary_loss_clip": 0.01149469, + "auxiliary_loss_mlp": 0.011082, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00062585, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 3.03177462334209, + "language_loss": 0.73179948, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75437617, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 2.5820233821868896 + }, + { + "auxiliary_loss_clip": 0.01165891, + "auxiliary_loss_mlp": 0.01106941, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00060642, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.4404556482164657, + "language_loss": 0.80062771, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82335603, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 2.529691219329834 + }, + { + "auxiliary_loss_clip": 0.01149515, + "auxiliary_loss_mlp": 0.01106355, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00059271, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 1.4446129376564218, + "language_loss": 0.74957669, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77213538, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 2.5360841751098633 + }, + { + "auxiliary_loss_clip": 0.0113274, + "auxiliary_loss_mlp": 0.00747551, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00065279, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.8129542699456604, + "language_loss": 0.75570345, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77450639, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 2.616982936859131 + }, + { + "auxiliary_loss_clip": 0.01117765, + "auxiliary_loss_mlp": 0.0110745, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00044823, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 2.1161825917028576, + "language_loss": 0.8399545, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86220658, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.609098434448242 + }, + { + "auxiliary_loss_clip": 0.01166254, + "auxiliary_loss_mlp": 0.01109241, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.00061715, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 7.059168823527081, + "language_loss": 0.79193342, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81468838, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.487640142440796 + }, + { + "auxiliary_loss_clip": 0.01132612, + "auxiliary_loss_mlp": 0.01106482, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00043297, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.61962091999813, + "language_loss": 0.78751224, + "learning_rate": 9.86315294700924e-07, + "loss": 0.80990314, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 2.575368642807007 + }, + { + "auxiliary_loss_clip": 0.01134102, + "auxiliary_loss_mlp": 0.01106159, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00058746, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.7033004004729502, + "language_loss": 0.71220535, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73460793, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.5836448669433594 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01107315, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00050354, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.5286211995179932, + "language_loss": 0.7053529, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72792017, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 3.9579014778137207 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01107831, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00054228, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.096349413778648, + "language_loss": 0.66641271, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68868375, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.571233034133911 + }, + { + "auxiliary_loss_clip": 0.01151196, + "auxiliary_loss_mlp": 0.01106557, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.0004127, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.7387150415836166, + "language_loss": 0.72030842, + "learning_rate": 9.84972678083801e-07, + "loss": 0.74288595, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 4.252464532852173 + }, + { + "auxiliary_loss_clip": 0.01165947, + "auxiliary_loss_mlp": 0.01107175, + "balance_loss_clip": 1.00190163, + "balance_loss_mlp": 1.00064921, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.2407208186628007, + "language_loss": 0.77140999, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79414123, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.5388379096984863 + }, + { + "auxiliary_loss_clip": 0.01148831, + "auxiliary_loss_mlp": 0.01106988, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00055778, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.748764250169786, + "language_loss": 0.62963092, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65218914, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.510363817214966 + }, + { + "auxiliary_loss_clip": 0.01148714, + "auxiliary_loss_mlp": 0.01107059, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00053382, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.7611063439512467, + "language_loss": 0.82838225, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85094005, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.5714831352233887 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01107317, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00060093, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 1.8333310435711272, + "language_loss": 0.69483113, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71741092, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 2.544647693634033 + }, + { + "auxiliary_loss_clip": 0.01117715, + "auxiliary_loss_mlp": 0.01108288, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00052261, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 1.6634532152258559, + "language_loss": 0.70208418, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72434413, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 4.1176886558532715 + }, + { + "auxiliary_loss_clip": 0.01151339, + "auxiliary_loss_mlp": 0.01108384, + "balance_loss_clip": 1.00209486, + "balance_loss_mlp": 1.00061917, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 1.8148663318318283, + "language_loss": 0.72457153, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74716878, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 2.546389102935791 + }, + { + "auxiliary_loss_clip": 0.01132424, + "auxiliary_loss_mlp": 0.0110715, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00033808, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.9519745947682177, + "language_loss": 0.66010821, + "learning_rate": 9.826245813561882e-07, + "loss": 0.68250394, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.5948433876037598 + }, + { + "auxiliary_loss_clip": 0.01132452, + "auxiliary_loss_mlp": 0.0110671, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.00047088, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.6411903647376236, + "language_loss": 0.80029953, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82269108, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.5696399211883545 + }, + { + "auxiliary_loss_clip": 0.01134125, + "auxiliary_loss_mlp": 0.01106862, + "balance_loss_clip": 1.00184834, + "balance_loss_mlp": 1.00052762, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.5702635747341709, + "language_loss": 0.89106703, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91347694, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.558237075805664 + }, + { + "auxiliary_loss_clip": 0.01119554, + "auxiliary_loss_mlp": 0.01107208, + "balance_loss_clip": 1.00175619, + "balance_loss_mlp": 1.00049162, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.256994328620249, + "language_loss": 0.71135664, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73362422, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.610783338546753 + }, + { + "auxiliary_loss_clip": 0.01116875, + "auxiliary_loss_mlp": 0.01106675, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.00062621, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.3724439126214887, + "language_loss": 0.84141076, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86364627, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.6373023986816406 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.0110694, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00050962, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.0549552663161084, + "language_loss": 0.83030421, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85272121, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 2.5566372871398926 + }, + { + "auxiliary_loss_clip": 0.0110329, + "auxiliary_loss_mlp": 0.01108298, + "balance_loss_clip": 1.00185025, + "balance_loss_mlp": 1.00043726, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 2.2750207537716873, + "language_loss": 0.76244885, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78456473, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 2.6814723014831543 + }, + { + "auxiliary_loss_clip": 0.01161449, + "auxiliary_loss_mlp": 0.01084835, + "balance_loss_clip": 1.00112545, + "balance_loss_mlp": 1.00014901, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6532208315133269, + "language_loss": 0.57208169, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59454453, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.197181463241577 + }, + { + "auxiliary_loss_clip": 0.01149159, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00048232, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.7121999107150419, + "language_loss": 0.6852507, + "learning_rate": 9.799433572314754e-07, + "loss": 0.70780855, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 2.5978660583496094 + }, + { + "auxiliary_loss_clip": 0.01151233, + "auxiliary_loss_mlp": 0.01105986, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00041437, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.728776028671623, + "language_loss": 0.81465149, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83722365, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 2.484598398208618 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.01107317, + "balance_loss_clip": 1.00163198, + "balance_loss_mlp": 1.00041056, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.4574065487791237, + "language_loss": 0.69772118, + "learning_rate": 9.792734377526718e-07, + "loss": 0.71981895, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.6557390689849854 + }, + { + "auxiliary_loss_clip": 0.01149153, + "auxiliary_loss_mlp": 0.01107491, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00058424, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 1.919356261199508, + "language_loss": 0.6637814, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68634784, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.4968676567077637 + }, + { + "auxiliary_loss_clip": 0.01149359, + "auxiliary_loss_mlp": 0.01107815, + "balance_loss_clip": 1.00205421, + "balance_loss_mlp": 1.00071716, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.434586050869734, + "language_loss": 0.74600577, + "learning_rate": 9.78603673098082e-07, + "loss": 0.76857746, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 2.5709218978881836 + }, + { + "auxiliary_loss_clip": 0.01134315, + "auxiliary_loss_mlp": 0.01105093, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00047481, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.5694759105459346, + "language_loss": 0.682594, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70498812, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 2.5407302379608154 + }, + { + "auxiliary_loss_clip": 0.01100463, + "auxiliary_loss_mlp": 0.00747553, + "balance_loss_clip": 1.0015626, + "balance_loss_mlp": 1.0005722, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.681032858695084, + "language_loss": 0.76855028, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78703034, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 2.636014938354492 + }, + { + "auxiliary_loss_clip": 0.01132294, + "auxiliary_loss_mlp": 0.01107257, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00044596, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 1.8044587106502534, + "language_loss": 0.74919957, + "learning_rate": 9.77599316633817e-07, + "loss": 0.77159512, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 2.6106128692626953 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01107148, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00062275, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.996987462037791, + "language_loss": 0.72583747, + "learning_rate": 9.772646086678758e-07, + "loss": 0.7482326, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 2.5682597160339355 + }, + { + "auxiliary_loss_clip": 0.01101213, + "auxiliary_loss_mlp": 0.00747675, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00067616, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.4895882150324296, + "language_loss": 0.78389019, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80237901, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.656921863555908 + }, + { + "auxiliary_loss_clip": 0.01112509, + "auxiliary_loss_mlp": 0.01084399, + "balance_loss_clip": 1.00087905, + "balance_loss_mlp": 1.00009394, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7703022494014051, + "language_loss": 0.5710268, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59299588, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 2.9807474613189697 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01107833, + "balance_loss_clip": 1.00186193, + "balance_loss_mlp": 1.00064039, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.8652203087526578, + "language_loss": 0.68313825, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70555747, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.646486282348633 + }, + { + "auxiliary_loss_clip": 0.01151296, + "auxiliary_loss_mlp": 0.01107447, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00054002, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.1130815321738923, + "language_loss": 0.69921446, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72180188, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 3.9507174491882324 + }, + { + "auxiliary_loss_clip": 0.0116584, + "auxiliary_loss_mlp": 0.01106983, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00045764, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.7580872199277466, + "language_loss": 0.73151249, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75424075, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 2.519991397857666 + }, + { + "auxiliary_loss_clip": 0.0114921, + "auxiliary_loss_mlp": 0.0110708, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00045907, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 1.8144601961336595, + "language_loss": 0.77468884, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79725182, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 2.533987283706665 + }, + { + "auxiliary_loss_clip": 0.01165922, + "auxiliary_loss_mlp": 0.01106965, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00053513, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 1.7757844092043549, + "language_loss": 0.6447916, + "learning_rate": 9.74922739519265e-07, + "loss": 0.6675204, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.5002973079681396 + }, + { + "auxiliary_loss_clip": 0.01084768, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.00165093, + "balance_loss_mlp": 1.00059128, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 1.8602415070734237, + "language_loss": 0.79274911, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81107187, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.678331136703491 + }, + { + "auxiliary_loss_clip": 0.01149151, + "auxiliary_loss_mlp": 0.01107615, + "balance_loss_clip": 1.00186813, + "balance_loss_mlp": 1.00051737, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 1.6636374596016739, + "language_loss": 0.63601458, + "learning_rate": 9.742539836972665e-07, + "loss": 0.65858221, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 2.5655596256256104 + }, + { + "auxiliary_loss_clip": 0.01102561, + "auxiliary_loss_mlp": 0.01106856, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00052094, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.460961704942927, + "language_loss": 0.72180355, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74389768, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.6244056224823 + }, + { + "auxiliary_loss_clip": 0.01149289, + "auxiliary_loss_mlp": 0.01107565, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00084949, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.206084759580303, + "language_loss": 0.74757659, + "learning_rate": 9.735853834608326e-07, + "loss": 0.77014512, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 2.5312979221343994 + }, + { + "auxiliary_loss_clip": 0.01149461, + "auxiliary_loss_mlp": 0.0110819, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00052047, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.426185006224191, + "language_loss": 0.72041452, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74299109, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.5667662620544434 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01107132, + "balance_loss_clip": 1.00202966, + "balance_loss_mlp": 1.0006063, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.86713694169985, + "language_loss": 0.85889393, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88146031, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 3.8845016956329346 + }, + { + "auxiliary_loss_clip": 0.01150802, + "auxiliary_loss_mlp": 0.01106385, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00062251, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.6876881616415618, + "language_loss": 0.81850564, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84107757, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 3.9377353191375732 + }, + { + "auxiliary_loss_clip": 0.01117619, + "auxiliary_loss_mlp": 0.01106162, + "balance_loss_clip": 1.00165951, + "balance_loss_mlp": 1.0007813, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.979988706635261, + "language_loss": 0.81513178, + "learning_rate": 9.72248650150294e-07, + "loss": 0.83736962, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.6336021423339844 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.01106574, + "balance_loss_clip": 1.00173247, + "balance_loss_mlp": 1.00071597, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.9039029169208206, + "language_loss": 0.72568524, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74780029, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.6437950134277344 + }, + { + "auxiliary_loss_clip": 0.0110415, + "auxiliary_loss_mlp": 0.01106824, + "balance_loss_clip": 1.00166464, + "balance_loss_mlp": 1.00067985, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4351451939269109, + "language_loss": 0.77586424, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79797399, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.668754816055298 + }, + { + "auxiliary_loss_clip": 0.01119098, + "auxiliary_loss_mlp": 0.01107031, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00069666, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 1.789345278534328, + "language_loss": 0.70504969, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72731096, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 2.6649913787841797 + }, + { + "auxiliary_loss_clip": 0.01132602, + "auxiliary_loss_mlp": 0.0110765, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00055265, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.3034474259893756, + "language_loss": 0.83720028, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85960281, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 2.6040682792663574 + }, + { + "auxiliary_loss_clip": 0.01115633, + "auxiliary_loss_mlp": 0.01106634, + "balance_loss_clip": 1.00169647, + "balance_loss_mlp": 1.00068045, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.5743692002254104, + "language_loss": 0.68478578, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70700848, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 4.071541786193848 + }, + { + "auxiliary_loss_clip": 0.01097409, + "auxiliary_loss_mlp": 0.01106209, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00054157, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.634422494995446, + "language_loss": 0.75251704, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77455318, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 2.6671595573425293 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.01108467, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00060606, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6185224097743283, + "language_loss": 0.79664081, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81890297, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 2.679426908493042 + }, + { + "auxiliary_loss_clip": 0.0111713, + "auxiliary_loss_mlp": 0.01107073, + "balance_loss_clip": 1.0016458, + "balance_loss_mlp": 1.00073791, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.073119173306389, + "language_loss": 0.66517413, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68741608, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.697854518890381 + }, + { + "auxiliary_loss_clip": 0.01132915, + "auxiliary_loss_mlp": 0.01107822, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00062907, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.211021435549493, + "language_loss": 0.64821672, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67062408, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.568531036376953 + }, + { + "auxiliary_loss_clip": 0.01088951, + "auxiliary_loss_mlp": 0.007476, + "balance_loss_clip": 1.00168085, + "balance_loss_mlp": 1.00063384, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.5555950654510766, + "language_loss": 0.78420687, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80257237, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.719414710998535 + }, + { + "auxiliary_loss_clip": 0.01146556, + "auxiliary_loss_mlp": 0.01084768, + "balance_loss_clip": 1.00112545, + "balance_loss_mlp": 1.0000819, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7193104737252126, + "language_loss": 0.52497876, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54729199, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.115114688873291 + }, + { + "auxiliary_loss_clip": 0.01165819, + "auxiliary_loss_mlp": 0.01106958, + "balance_loss_clip": 1.00188386, + "balance_loss_mlp": 1.00062287, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.6039265274952634, + "language_loss": 0.79704636, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81977409, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.4962270259857178 + }, + { + "auxiliary_loss_clip": 0.0114937, + "auxiliary_loss_mlp": 0.01109325, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00041509, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 2.0598898264441683, + "language_loss": 0.73805672, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76064366, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 2.6579673290252686 + }, + { + "auxiliary_loss_clip": 0.01165686, + "auxiliary_loss_mlp": 0.01106887, + "balance_loss_clip": 1.00185096, + "balance_loss_mlp": 1.00064778, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.4719124524101135, + "language_loss": 0.7922895, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81501526, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 2.537721633911133 + }, + { + "auxiliary_loss_clip": 0.01149058, + "auxiliary_loss_mlp": 0.01106558, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00060439, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.6254171000628048, + "language_loss": 0.73192734, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75448358, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.529066801071167 + }, + { + "auxiliary_loss_clip": 0.01102799, + "auxiliary_loss_mlp": 0.01108881, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00054407, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.4005093138309945, + "language_loss": 0.80012327, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82224, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.7100656032562256 + }, + { + "auxiliary_loss_clip": 0.0115053, + "auxiliary_loss_mlp": 0.01106789, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00064564, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.661963234082886, + "language_loss": 0.78338367, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80595684, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 2.5362606048583984 + }, + { + "auxiliary_loss_clip": 0.0108242, + "auxiliary_loss_mlp": 0.01105934, + "balance_loss_clip": 1.00157177, + "balance_loss_mlp": 1.00055325, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 1.8137348600517251, + "language_loss": 0.61695671, + "learning_rate": 9.662410784947599e-07, + "loss": 0.63884032, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 2.6979689598083496 + }, + { + "auxiliary_loss_clip": 0.01104275, + "auxiliary_loss_mlp": 0.01106278, + "balance_loss_clip": 1.0017637, + "balance_loss_mlp": 1.0005151, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 1.8785686565521778, + "language_loss": 0.82332981, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84543538, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.6229324340820312 + }, + { + "auxiliary_loss_clip": 0.01133626, + "auxiliary_loss_mlp": 0.01107391, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00048375, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 2.673047044161078, + "language_loss": 0.78426361, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80667377, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 2.569932222366333 + }, + { + "auxiliary_loss_clip": 0.0113253, + "auxiliary_loss_mlp": 0.01084447, + "balance_loss_clip": 1.00121546, + "balance_loss_mlp": 1.00014234, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.818641166390098, + "language_loss": 0.59616113, + "learning_rate": 9.65241049367493e-07, + "loss": 0.6183309, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 3.203216314315796 + }, + { + "auxiliary_loss_clip": 0.01121535, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00075364, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 2.3821572264420094, + "language_loss": 0.78758931, + "learning_rate": 9.64907784784544e-07, + "loss": 0.8099013, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.6142568588256836 + }, + { + "auxiliary_loss_clip": 0.01149225, + "auxiliary_loss_mlp": 0.01107142, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.0005213, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 2.2170797202558625, + "language_loss": 0.81745052, + "learning_rate": 9.645745594523958e-07, + "loss": 0.84001422, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 2.560822010040283 + }, + { + "auxiliary_loss_clip": 0.01150961, + "auxiliary_loss_mlp": 0.01107313, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00069213, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.9301006804130345, + "language_loss": 0.75524461, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77782726, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 2.5706450939178467 + }, + { + "auxiliary_loss_clip": 0.01128795, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_clip": 1.00159526, + "balance_loss_mlp": 1.00033915, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8659206512766053, + "language_loss": 0.59758633, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61972839, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.218850612640381 + }, + { + "auxiliary_loss_clip": 0.01134717, + "auxiliary_loss_mlp": 0.01106644, + "balance_loss_clip": 1.00160694, + "balance_loss_mlp": 1.00059581, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.984805118880251, + "language_loss": 0.75007188, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77248549, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 3.960754156112671 + }, + { + "auxiliary_loss_clip": 0.01132342, + "auxiliary_loss_mlp": 0.01107018, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00068378, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.2397257154307644, + "language_loss": 0.89462429, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91701794, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 2.5701189041137695 + }, + { + "auxiliary_loss_clip": 0.01134295, + "auxiliary_loss_mlp": 0.0110604, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00056338, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 1.910977643539173, + "language_loss": 0.88094729, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90335059, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 2.5872373580932617 + }, + { + "auxiliary_loss_clip": 0.01117828, + "auxiliary_loss_mlp": 0.01107698, + "balance_loss_clip": 1.00169253, + "balance_loss_mlp": 1.0006001, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.8463721332293725, + "language_loss": 0.81077325, + "learning_rate": 9.625760324338272e-07, + "loss": 0.8330285, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 2.622790813446045 + }, + { + "auxiliary_loss_clip": 0.01132457, + "auxiliary_loss_mlp": 0.0110659, + "balance_loss_clip": 1.00172627, + "balance_loss_mlp": 1.00054121, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.5072913883482455, + "language_loss": 0.76624441, + "learning_rate": 9.622430822110062e-07, + "loss": 0.7886349, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.610837459564209 + }, + { + "auxiliary_loss_clip": 0.0113241, + "auxiliary_loss_mlp": 0.011079, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00070715, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 3.5810088213592763, + "language_loss": 0.68870735, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71111047, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 2.5632293224334717 + }, + { + "auxiliary_loss_clip": 0.0111943, + "auxiliary_loss_mlp": 0.01106792, + "balance_loss_clip": 1.0017283, + "balance_loss_mlp": 1.00055313, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 2.1575393505552576, + "language_loss": 0.73065013, + "learning_rate": 9.615772998335261e-07, + "loss": 0.7529124, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.6287901401519775 + }, + { + "auxiliary_loss_clip": 0.01149519, + "auxiliary_loss_mlp": 0.01107915, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00062728, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 2.302759335893964, + "language_loss": 0.78693271, + "learning_rate": 9.612444677041138e-07, + "loss": 0.80950701, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 2.4987869262695312 + }, + { + "auxiliary_loss_clip": 0.01146811, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_clip": 1.00116706, + "balance_loss_mlp": 1.00037372, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7405325428109871, + "language_loss": 0.59862721, + "learning_rate": 9.609116749644162e-07, + "loss": 0.62094212, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 3.0005738735198975 + }, + { + "auxiliary_loss_clip": 0.01132478, + "auxiliary_loss_mlp": 0.01105629, + "balance_loss_clip": 1.00160396, + "balance_loss_mlp": 1.00053453, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.5530352080115704, + "language_loss": 0.63773012, + "learning_rate": 9.605789216270511e-07, + "loss": 0.66011119, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 3.912140130996704 + }, + { + "auxiliary_loss_clip": 0.01149227, + "auxiliary_loss_mlp": 0.01107716, + "balance_loss_clip": 1.00208652, + "balance_loss_mlp": 1.00052261, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 2.7297282316254745, + "language_loss": 0.71886396, + "learning_rate": 9.602462077046375e-07, + "loss": 0.74143344, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.546191453933716 + }, + { + "auxiliary_loss_clip": 0.01115552, + "auxiliary_loss_mlp": 0.01084398, + "balance_loss_clip": 1.00114107, + "balance_loss_mlp": 1.00009274, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.5086093095820057, + "language_loss": 0.56710362, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58910316, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 4.70028281211853 + }, + { + "auxiliary_loss_clip": 0.01149229, + "auxiliary_loss_mlp": 0.01107438, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00043571, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.6738663188767775, + "language_loss": 0.73799121, + "learning_rate": 9.595808981551312e-07, + "loss": 0.76055789, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.531856060028076 + }, + { + "auxiliary_loss_clip": 0.01134274, + "auxiliary_loss_mlp": 0.01105768, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00057745, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 2.0010161659941503, + "language_loss": 0.7074976, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72989804, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.606795310974121 + }, + { + "auxiliary_loss_clip": 0.01165929, + "auxiliary_loss_mlp": 0.01107622, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.00071537, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 1.7184458230467188, + "language_loss": 0.74703217, + "learning_rate": 9.58915746416808e-07, + "loss": 0.7697677, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 2.5232274532318115 + }, + { + "auxiliary_loss_clip": 0.01131924, + "auxiliary_loss_mlp": 0.01084636, + "balance_loss_clip": 1.00112748, + "balance_loss_mlp": 1.0003314, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7204237202992947, + "language_loss": 0.56799817, + "learning_rate": 9.585832297583707e-07, + "loss": 0.59016383, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.253587484359741 + }, + { + "auxiliary_loss_clip": 0.01165886, + "auxiliary_loss_mlp": 0.01108187, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00070775, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.7225307204024223, + "language_loss": 0.7837981, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80653882, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 2.5135648250579834 + }, + { + "auxiliary_loss_clip": 0.01165862, + "auxiliary_loss_mlp": 0.01105699, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00050914, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 1.797915196312369, + "language_loss": 0.69540995, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71812558, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 3.912562847137451 + }, + { + "auxiliary_loss_clip": 0.01132362, + "auxiliary_loss_mlp": 0.01106755, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00061131, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 1.8652212375638182, + "language_loss": 0.78470862, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80709982, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.5658464431762695 + }, + { + "auxiliary_loss_clip": 0.01144428, + "auxiliary_loss_mlp": 0.01084741, + "balance_loss_clip": 1.00120997, + "balance_loss_mlp": 1.00043619, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8811574592881153, + "language_loss": 0.67270792, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69499969, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 2.9615602493286133 + }, + { + "auxiliary_loss_clip": 0.01145362, + "auxiliary_loss_mlp": 0.01084684, + "balance_loss_clip": 1.00112677, + "balance_loss_mlp": 1.00037909, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.821952935208597, + "language_loss": 0.58138227, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60368276, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 3.146256685256958 + }, + { + "auxiliary_loss_clip": 0.01100493, + "auxiliary_loss_mlp": 0.01105488, + "balance_loss_clip": 1.00160718, + "balance_loss_mlp": 1.00048804, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 1.5911017852619924, + "language_loss": 0.7956934, + "learning_rate": 9.565889595521517e-07, + "loss": 0.8177532, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 2.719764232635498 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01107667, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00075972, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 1.6536170493883904, + "language_loss": 0.76985669, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79242504, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.539537191390991 + }, + { + "auxiliary_loss_clip": 0.01118174, + "auxiliary_loss_mlp": 0.01107944, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00065589, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.3607465276245443, + "language_loss": 0.8421191, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86438024, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.5864124298095703 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01107351, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00073004, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.328481250559986, + "language_loss": 0.83321309, + "learning_rate": 9.555923584232984e-07, + "loss": 0.8557747, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.5284199714660645 + }, + { + "auxiliary_loss_clip": 0.0115133, + "auxiliary_loss_mlp": 0.01107489, + "balance_loss_clip": 1.00195706, + "balance_loss_mlp": 1.00058198, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.7946399756291818, + "language_loss": 0.71846247, + "learning_rate": 9.552602372383047e-07, + "loss": 0.7410506, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.667853832244873 + }, + { + "auxiliary_loss_clip": 0.0114958, + "auxiliary_loss_mlp": 0.01106045, + "balance_loss_clip": 1.00209284, + "balance_loss_mlp": 1.00056887, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.8193267631660226, + "language_loss": 0.62448311, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64703941, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 2.7176291942596436 + }, + { + "auxiliary_loss_clip": 0.01127748, + "auxiliary_loss_mlp": 0.01084495, + "balance_loss_clip": 1.00117683, + "balance_loss_mlp": 1.00019002, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7287611587162657, + "language_loss": 0.55988503, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58200747, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 3.259221315383911 + }, + { + "auxiliary_loss_clip": 0.01117912, + "auxiliary_loss_mlp": 0.00747517, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00048351, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 1.985930322373402, + "language_loss": 0.87642866, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89508295, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.62494158744812 + }, + { + "auxiliary_loss_clip": 0.01105001, + "auxiliary_loss_mlp": 0.01107445, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00063324, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.5746747542765385, + "language_loss": 0.79159433, + "learning_rate": 9.539321487906117e-07, + "loss": 0.8137188, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.6883420944213867 + }, + { + "auxiliary_loss_clip": 0.01133852, + "auxiliary_loss_mlp": 0.0110606, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.0004878, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.2517182789650003, + "language_loss": 0.7078954, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73029459, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.5585381984710693 + }, + { + "auxiliary_loss_clip": 0.01102469, + "auxiliary_loss_mlp": 0.0110712, + "balance_loss_clip": 1.00169766, + "balance_loss_mlp": 1.00059414, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.470127708653885, + "language_loss": 0.64423156, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66632748, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 2.6939969062805176 + }, + { + "auxiliary_loss_clip": 0.0113475, + "auxiliary_loss_mlp": 0.00747659, + "balance_loss_clip": 1.00200295, + "balance_loss_mlp": 1.00061035, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 1.4759855204015204, + "language_loss": 0.80849361, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82731771, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 2.6482017040252686 + }, + { + "auxiliary_loss_clip": 0.01115839, + "auxiliary_loss_mlp": 0.01106899, + "balance_loss_clip": 1.00198758, + "balance_loss_mlp": 1.00056458, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.5978153269872963, + "language_loss": 0.7315414, + "learning_rate": 9.526046950148527e-07, + "loss": 0.7537688, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 2.644338369369507 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01107615, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00061297, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.0822227379704215, + "language_loss": 0.7877124, + "learning_rate": 9.522729308327931e-07, + "loss": 0.80997235, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.696779489517212 + }, + { + "auxiliary_loss_clip": 0.01069754, + "auxiliary_loss_mlp": 0.011069, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00056577, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 2.01960459351817, + "language_loss": 0.71305621, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73482275, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.7786333560943604 + }, + { + "auxiliary_loss_clip": 0.01101242, + "auxiliary_loss_mlp": 0.01106794, + "balance_loss_clip": 1.00171053, + "balance_loss_mlp": 1.00065041, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.6610995329269618, + "language_loss": 0.70671266, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72879303, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 2.98203706741333 + }, + { + "auxiliary_loss_clip": 0.01151487, + "auxiliary_loss_mlp": 0.01106966, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00072706, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5363381951019144, + "language_loss": 0.70394653, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72653109, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 2.515380382537842 + }, + { + "auxiliary_loss_clip": 0.01101692, + "auxiliary_loss_mlp": 0.01108775, + "balance_loss_clip": 1.00163436, + "balance_loss_mlp": 1.00062788, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.8169518303327428, + "language_loss": 0.77924722, + "learning_rate": 9.509462715294927e-07, + "loss": 0.8013519, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 4.030602216720581 + }, + { + "auxiliary_loss_clip": 0.01165793, + "auxiliary_loss_mlp": 0.01106401, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00063801, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 1.6881567702538212, + "language_loss": 0.75718075, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77990264, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 2.4843907356262207 + }, + { + "auxiliary_loss_clip": 0.01151437, + "auxiliary_loss_mlp": 0.01107791, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00078893, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 2.9024523958537927, + "language_loss": 0.72983098, + "learning_rate": 9.502831805088742e-07, + "loss": 0.75242329, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 2.5545494556427 + }, + { + "auxiliary_loss_clip": 0.01165854, + "auxiliary_loss_mlp": 0.01106842, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00069845, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.488766700740878, + "language_loss": 0.81043589, + "learning_rate": 9.499516947003294e-07, + "loss": 0.8331629, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.4809796810150146 + }, + { + "auxiliary_loss_clip": 0.01134667, + "auxiliary_loss_mlp": 0.01108148, + "balance_loss_clip": 1.00199282, + "balance_loss_mlp": 1.000669, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3977432335047895, + "language_loss": 0.7776528, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80008096, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.01146679, + "auxiliary_loss_mlp": 0.01084455, + "balance_loss_clip": 1.00113034, + "balance_loss_mlp": 1.00014997, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7891091871023278, + "language_loss": 0.61000156, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63231289, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.171647071838379 + }, + { + "auxiliary_loss_clip": 0.01116942, + "auxiliary_loss_mlp": 0.01107775, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00058222, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.5971444047620644, + "language_loss": 0.76604164, + "learning_rate": 9.489574762325907e-07, + "loss": 0.78828883, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.603342056274414 + }, + { + "auxiliary_loss_clip": 0.01132466, + "auxiliary_loss_mlp": 0.01107819, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00062609, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 4.06661194140193, + "language_loss": 0.71295029, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73535311, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 2.5993120670318604 + }, + { + "auxiliary_loss_clip": 0.01149258, + "auxiliary_loss_mlp": 0.01107361, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00045466, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.6606616404747407, + "language_loss": 0.7004627, + "learning_rate": 9.482948631780087e-07, + "loss": 0.7230289, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 2.517554998397827 + }, + { + "auxiliary_loss_clip": 0.01101994, + "auxiliary_loss_mlp": 0.01105416, + "balance_loss_clip": 1.00171185, + "balance_loss_mlp": 1.00041628, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.627912866766131, + "language_loss": 0.78520393, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80727798, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 4.197850704193115 + }, + { + "auxiliary_loss_clip": 0.01148792, + "auxiliary_loss_mlp": 0.01108984, + "balance_loss_clip": 1.00179327, + "balance_loss_mlp": 1.00055099, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 3.015769514014542, + "language_loss": 0.71780157, + "learning_rate": 9.476324096464821e-07, + "loss": 0.74037933, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 3.9443445205688477 + }, + { + "auxiliary_loss_clip": 0.01102985, + "auxiliary_loss_mlp": 0.01107413, + "balance_loss_clip": 1.0016619, + "balance_loss_mlp": 1.0006969, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 1.8586566169590457, + "language_loss": 0.69865453, + "learning_rate": 9.473012427332654e-07, + "loss": 0.72075844, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 2.633042335510254 + }, + { + "auxiliary_loss_clip": 0.0116577, + "auxiliary_loss_mlp": 0.01106697, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00055337, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 2.9927515253386083, + "language_loss": 0.71069276, + "learning_rate": 9.469701157384919e-07, + "loss": 0.73341745, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 2.437349319458008 + }, + { + "auxiliary_loss_clip": 0.01149458, + "auxiliary_loss_mlp": 0.01107386, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.00076485, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.618261954869619, + "language_loss": 0.73731434, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75988275, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 2.5209169387817383 + }, + { + "auxiliary_loss_clip": 0.01134881, + "auxiliary_loss_mlp": 0.01108003, + "balance_loss_clip": 1.00177526, + "balance_loss_mlp": 1.00042892, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.0973352537886614, + "language_loss": 0.86579871, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88822758, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.5629236698150635 + }, + { + "auxiliary_loss_clip": 0.0114928, + "auxiliary_loss_mlp": 0.01108387, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00052619, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.6036975053712117, + "language_loss": 0.67182696, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69440365, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 2.5887656211853027 + }, + { + "auxiliary_loss_clip": 0.01135123, + "auxiliary_loss_mlp": 0.01106623, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00047934, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.3266131038346722, + "language_loss": 0.75892991, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78134739, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 3.9728410243988037 + }, + { + "auxiliary_loss_clip": 0.01134161, + "auxiliary_loss_mlp": 0.01107919, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00063074, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 2.0114580097932087, + "language_loss": 0.77597034, + "learning_rate": 9.45315079980678e-07, + "loss": 0.7983911, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.569361925125122 + }, + { + "auxiliary_loss_clip": 0.01100895, + "auxiliary_loss_mlp": 0.0110751, + "balance_loss_clip": 1.00165248, + "balance_loss_mlp": 1.00041246, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.60513734563816, + "language_loss": 0.76325083, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78533489, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.680870532989502 + }, + { + "auxiliary_loss_clip": 0.01165761, + "auxiliary_loss_mlp": 0.01106942, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.0007987, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.6807204549419907, + "language_loss": 0.71531314, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73804021, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.4796671867370605 + }, + { + "auxiliary_loss_clip": 0.01116811, + "auxiliary_loss_mlp": 0.01107437, + "balance_loss_clip": 1.00162554, + "balance_loss_mlp": 1.0004344, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.5889533093976793, + "language_loss": 0.74664152, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76888406, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.7417423725128174 + }, + { + "auxiliary_loss_clip": 0.01148991, + "auxiliary_loss_mlp": 0.01107034, + "balance_loss_clip": 1.00182283, + "balance_loss_mlp": 1.00069976, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.8979044209035916, + "language_loss": 0.76988316, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79244345, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 2.544464111328125 + }, + { + "auxiliary_loss_clip": 0.0115022, + "auxiliary_loss_mlp": 0.01107329, + "balance_loss_clip": 1.00187874, + "balance_loss_mlp": 1.00070834, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.9311955205249345, + "language_loss": 0.77055895, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79313445, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.605574369430542 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.01107668, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.0006659, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.3370345247509963, + "language_loss": 0.72732258, + "learning_rate": 9.433303570032129e-07, + "loss": 0.74956691, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 2.6242218017578125 + }, + { + "auxiliary_loss_clip": 0.01132402, + "auxiliary_loss_mlp": 0.01106578, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00043428, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.6815979816976137, + "language_loss": 0.65064538, + "learning_rate": 9.429997100087112e-07, + "loss": 0.6730352, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.6189258098602295 + }, + { + "auxiliary_loss_clip": 0.0111593, + "auxiliary_loss_mlp": 0.01106501, + "balance_loss_clip": 1.00165224, + "balance_loss_mlp": 1.00045252, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.7681644633947138, + "language_loss": 0.71666211, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73888642, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.616004467010498 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01107224, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00060272, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 1.8921555346552321, + "language_loss": 0.84674823, + "learning_rate": 9.423385362769136e-07, + "loss": 0.86884904, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.6488845348358154 + }, + { + "auxiliary_loss_clip": 0.01148625, + "auxiliary_loss_mlp": 0.0110698, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00055027, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.434942279596459, + "language_loss": 0.76389629, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78645235, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 2.598433256149292 + }, + { + "auxiliary_loss_clip": 0.01117654, + "auxiliary_loss_mlp": 0.01107702, + "balance_loss_clip": 1.00170565, + "balance_loss_mlp": 1.0005095, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 2.1217862384478523, + "language_loss": 0.73067951, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75293303, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.6598715782165527 + }, + { + "auxiliary_loss_clip": 0.01134501, + "auxiliary_loss_mlp": 0.01107909, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00033474, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 4.730481405308974, + "language_loss": 0.83081758, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85324168, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 2.643388271331787 + }, + { + "auxiliary_loss_clip": 0.01151188, + "auxiliary_loss_mlp": 0.01106379, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00052118, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 1.893965363088241, + "language_loss": 0.70203084, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72460645, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.555269718170166 + }, + { + "auxiliary_loss_clip": 0.01134284, + "auxiliary_loss_mlp": 0.00747597, + "balance_loss_clip": 1.00172639, + "balance_loss_mlp": 1.00052583, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.8732273940911925, + "language_loss": 0.80200118, + "learning_rate": 9.406863040327355e-07, + "loss": 0.82081997, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.613894462585449 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01106249, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00048697, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.8059599809250788, + "language_loss": 0.67726415, + "learning_rate": 9.403559780416295e-07, + "loss": 0.69966686, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 2.6116437911987305 + }, + { + "auxiliary_loss_clip": 0.01149356, + "auxiliary_loss_mlp": 0.01106293, + "balance_loss_clip": 1.0021075, + "balance_loss_mlp": 1.00062633, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 2.1904201755188737, + "language_loss": 0.72854602, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75110251, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 2.6568055152893066 + }, + { + "auxiliary_loss_clip": 0.01118092, + "auxiliary_loss_mlp": 0.01107848, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00056016, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.7305264042015953, + "language_loss": 0.80493689, + "learning_rate": 9.396954466173657e-07, + "loss": 0.8271963, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.604983329772949 + }, + { + "auxiliary_loss_clip": 0.01165929, + "auxiliary_loss_mlp": 0.01107562, + "balance_loss_clip": 1.00198507, + "balance_loss_mlp": 1.00065517, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 3.5839144490029153, + "language_loss": 0.81305075, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83578563, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 2.49300217628479 + }, + { + "auxiliary_loss_clip": 0.01119513, + "auxiliary_loss_mlp": 0.01105428, + "balance_loss_clip": 1.00170588, + "balance_loss_mlp": 1.00061953, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 2.7188590049424213, + "language_loss": 0.81884074, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84109014, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.663137197494507 + }, + { + "auxiliary_loss_clip": 0.01134567, + "auxiliary_loss_mlp": 0.0110837, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00060463, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 3.110192726215079, + "language_loss": 0.779167, + "learning_rate": 9.387049510636793e-07, + "loss": 0.8015964, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 4.078432559967041 + }, + { + "auxiliary_loss_clip": 0.01165715, + "auxiliary_loss_mlp": 0.01106516, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00065851, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.468505798821421, + "language_loss": 0.71915543, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74187773, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 2.5485756397247314 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.00059545, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.8254618683138832, + "language_loss": 0.75496733, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77752173, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 2.4955978393554688 + }, + { + "auxiliary_loss_clip": 0.01118072, + "auxiliary_loss_mlp": 0.01106694, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00064576, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.5357011677399661, + "language_loss": 0.71953112, + "learning_rate": 9.377148177097167e-07, + "loss": 0.74177879, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 2.6215620040893555 + }, + { + "auxiliary_loss_clip": 0.01119833, + "auxiliary_loss_mlp": 0.01108159, + "balance_loss_clip": 1.00169909, + "balance_loss_mlp": 1.00058436, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.71519709356748, + "language_loss": 0.6638059, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68608582, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.5975191593170166 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01107102, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00057697, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 2.004850013227425, + "language_loss": 0.69639981, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71896476, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.561450242996216 + }, + { + "auxiliary_loss_clip": 0.0113613, + "auxiliary_loss_mlp": 0.01107354, + "balance_loss_clip": 1.00201499, + "balance_loss_mlp": 1.00063753, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.4071555270849665, + "language_loss": 0.76113921, + "learning_rate": 9.367250468933893e-07, + "loss": 0.7835741, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.6448092460632324 + }, + { + "auxiliary_loss_clip": 0.01165737, + "auxiliary_loss_mlp": 0.01107012, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00048649, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 1.9584613030829148, + "language_loss": 0.76541543, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78814292, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.52252197265625 + }, + { + "auxiliary_loss_clip": 0.01146809, + "auxiliary_loss_mlp": 0.01084431, + "balance_loss_clip": 1.00120592, + "balance_loss_mlp": 1.00012577, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8192744834590144, + "language_loss": 0.58366406, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60597646, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.17085599899292 + }, + { + "auxiliary_loss_clip": 0.01151327, + "auxiliary_loss_mlp": 0.01107757, + "balance_loss_clip": 1.00188482, + "balance_loss_mlp": 1.00037301, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.4604368297851698, + "language_loss": 0.75581986, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77841067, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 3.9805610179901123 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01107119, + "balance_loss_clip": 1.00174856, + "balance_loss_mlp": 1.00049841, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.207223346598322, + "language_loss": 0.73191833, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75433332, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 4.014413833618164 + }, + { + "auxiliary_loss_clip": 0.01150968, + "auxiliary_loss_mlp": 0.01107707, + "balance_loss_clip": 1.00186861, + "balance_loss_mlp": 1.00070477, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.5623932688157403, + "language_loss": 0.74780732, + "learning_rate": 9.350762354227673e-07, + "loss": 0.77039409, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 2.572230339050293 + }, + { + "auxiliary_loss_clip": 0.01165788, + "auxiliary_loss_mlp": 0.01106706, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00056195, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.983478275145202, + "language_loss": 0.70096076, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72368568, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 2.5275309085845947 + }, + { + "auxiliary_loss_clip": 0.01114656, + "auxiliary_loss_mlp": 0.01108363, + "balance_loss_clip": 1.00203907, + "balance_loss_mlp": 1.00059748, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.691649799233177, + "language_loss": 0.75716138, + "learning_rate": 9.344169934211068e-07, + "loss": 0.77939159, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.6443214416503906 + }, + { + "auxiliary_loss_clip": 0.01149203, + "auxiliary_loss_mlp": 0.01107821, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.0004375, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.324539353454482, + "language_loss": 0.69611585, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71868604, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 2.5937843322753906 + }, + { + "auxiliary_loss_clip": 0.01165857, + "auxiliary_loss_mlp": 0.01108349, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00067902, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 2.128998133954105, + "language_loss": 0.71899664, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74173874, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.4885573387145996 + }, + { + "auxiliary_loss_clip": 0.01146567, + "auxiliary_loss_mlp": 0.00746407, + "balance_loss_clip": 1.00119221, + "balance_loss_mlp": 1.00062585, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7864100620429197, + "language_loss": 0.50685859, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52578831, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 2.982266664505005 + }, + { + "auxiliary_loss_clip": 0.01148492, + "auxiliary_loss_mlp": 0.01106335, + "balance_loss_clip": 1.00179517, + "balance_loss_mlp": 1.00057292, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.7159034455508526, + "language_loss": 0.75471318, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77726144, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 3.9574544429779053 + }, + { + "auxiliary_loss_clip": 0.01134516, + "auxiliary_loss_mlp": 0.01107954, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00066566, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.0414807055707396, + "language_loss": 0.72625798, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74868262, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.536017656326294 + }, + { + "auxiliary_loss_clip": 0.01134387, + "auxiliary_loss_mlp": 0.01106211, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.00082982, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.5589572152892117, + "language_loss": 0.81186211, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83426803, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.6051533222198486 + }, + { + "auxiliary_loss_clip": 0.01148761, + "auxiliary_loss_mlp": 0.00747597, + "balance_loss_clip": 1.00190651, + "balance_loss_mlp": 1.00050735, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5005137993626907, + "language_loss": 0.7627672, + "learning_rate": 9.321109198922301e-07, + "loss": 0.78173083, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.566344976425171 + }, + { + "auxiliary_loss_clip": 0.01165758, + "auxiliary_loss_mlp": 0.01106562, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00060916, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.3764633313096497, + "language_loss": 0.68414545, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70686871, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 2.443431854248047 + }, + { + "auxiliary_loss_clip": 0.01116762, + "auxiliary_loss_mlp": 0.01106451, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00059271, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.6915617415765716, + "language_loss": 0.68530321, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70753533, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 2.6479437351226807 + }, + { + "auxiliary_loss_clip": 0.01118126, + "auxiliary_loss_mlp": 0.01107736, + "balance_loss_clip": 1.00165808, + "balance_loss_mlp": 1.00044727, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.8455777002564815, + "language_loss": 0.76916069, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79141933, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.6180968284606934 + }, + { + "auxiliary_loss_clip": 0.01133872, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_clip": 1.00174809, + "balance_loss_mlp": 1.0006125, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.6815949153176704, + "language_loss": 0.69506133, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71746379, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 2.60286021232605 + }, + { + "auxiliary_loss_clip": 0.01149266, + "auxiliary_loss_mlp": 0.01107004, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.0004791, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4907162119726265, + "language_loss": 0.87576509, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89832783, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.525054693222046 + }, + { + "auxiliary_loss_clip": 0.01100569, + "auxiliary_loss_mlp": 0.01105277, + "balance_loss_clip": 1.00161028, + "balance_loss_mlp": 1.0004679, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.4732363031267341, + "language_loss": 0.68443704, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70649552, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.620605945587158 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.01107146, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00062072, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.7107599145027448, + "language_loss": 0.65151662, + "learning_rate": 9.298068305916373e-07, + "loss": 0.6740799, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 2.6061627864837646 + }, + { + "auxiliary_loss_clip": 0.01149132, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00062394, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.3758953593483279, + "language_loss": 0.72544897, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74801177, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.5923402309417725 + }, + { + "auxiliary_loss_clip": 0.0116581, + "auxiliary_loss_mlp": 0.0110624, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00047731, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.6493390399102783, + "language_loss": 0.71967113, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74239159, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 2.477325677871704 + }, + { + "auxiliary_loss_clip": 0.01132753, + "auxiliary_loss_mlp": 0.01107134, + "balance_loss_clip": 1.00167382, + "balance_loss_mlp": 1.00070369, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 1.9322572220327623, + "language_loss": 0.81111485, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83351374, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.562633991241455 + }, + { + "auxiliary_loss_clip": 0.01166044, + "auxiliary_loss_mlp": 0.01107952, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00066364, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.7302942993163695, + "language_loss": 0.66031432, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68305433, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 2.6096060276031494 + }, + { + "auxiliary_loss_clip": 0.01144868, + "auxiliary_loss_mlp": 0.01083667, + "balance_loss_clip": 1.00119996, + "balance_loss_mlp": 1.00012553, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.7999073497159561, + "language_loss": 0.55145121, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57373655, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 3.0164759159088135 + }, + { + "auxiliary_loss_clip": 0.01149024, + "auxiliary_loss_mlp": 0.01105852, + "balance_loss_clip": 1.00189781, + "balance_loss_mlp": 1.00066137, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.6911328399409051, + "language_loss": 0.77849036, + "learning_rate": 9.278334794344715e-07, + "loss": 0.8010391, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.5334689617156982 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01106933, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.0006938, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.6036866391265567, + "language_loss": 0.78275818, + "learning_rate": 9.275047298005232e-07, + "loss": 0.8051883, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 2.5798938274383545 + }, + { + "auxiliary_loss_clip": 0.01131046, + "auxiliary_loss_mlp": 0.01106758, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00061464, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.468903720741073, + "language_loss": 0.7639457, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78632373, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 2.5952234268188477 + }, + { + "auxiliary_loss_clip": 0.01119597, + "auxiliary_loss_mlp": 0.01107757, + "balance_loss_clip": 1.00177622, + "balance_loss_mlp": 1.00065911, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 1.8040180971556465, + "language_loss": 0.75415933, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77643281, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.6900975704193115 + }, + { + "auxiliary_loss_clip": 0.0108595, + "auxiliary_loss_mlp": 0.01105675, + "balance_loss_clip": 1.00159907, + "balance_loss_mlp": 1.00038886, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.9791695964084324, + "language_loss": 0.74246418, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76438051, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.72697377204895 + }, + { + "auxiliary_loss_clip": 0.01121093, + "auxiliary_loss_mlp": 0.01107336, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00062025, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.2528435883532665, + "language_loss": 0.88447857, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90676296, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 4.012494325637817 + }, + { + "auxiliary_loss_clip": 0.01165725, + "auxiliary_loss_mlp": 0.01106443, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00049007, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.3580665522552127, + "language_loss": 0.70343655, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72615826, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 2.5530824661254883 + }, + { + "auxiliary_loss_clip": 0.01149309, + "auxiliary_loss_mlp": 0.01106618, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00056911, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.1580775130184167, + "language_loss": 0.68773758, + "learning_rate": 9.255330864847313e-07, + "loss": 0.71029687, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 2.5807933807373047 + }, + { + "auxiliary_loss_clip": 0.01149136, + "auxiliary_loss_mlp": 0.01106887, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00064826, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.6638281991751451, + "language_loss": 0.76021492, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78277516, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 2.509018898010254 + }, + { + "auxiliary_loss_clip": 0.01151086, + "auxiliary_loss_mlp": 0.011071, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.0004797, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 2.193145017980108, + "language_loss": 0.7840845, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80666637, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.5444788932800293 + }, + { + "auxiliary_loss_clip": 0.01121824, + "auxiliary_loss_mlp": 0.0110615, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00048327, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.5639380469420503, + "language_loss": 0.75288981, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77516961, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.666126251220703 + }, + { + "auxiliary_loss_clip": 0.01117432, + "auxiliary_loss_mlp": 0.01106937, + "balance_loss_clip": 1.00176299, + "balance_loss_mlp": 1.00050688, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.6378233452674937, + "language_loss": 0.69289106, + "learning_rate": 9.24219472319246e-07, + "loss": 0.7151348, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.684297800064087 + }, + { + "auxiliary_loss_clip": 0.01165812, + "auxiliary_loss_mlp": 0.01106284, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.0005213, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.4031381943036856, + "language_loss": 0.82710475, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84982568, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.4974915981292725 + }, + { + "auxiliary_loss_clip": 0.01165856, + "auxiliary_loss_mlp": 0.01107684, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.0004909, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.1989628867517497, + "language_loss": 0.65484262, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67757797, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 2.5353024005889893 + }, + { + "auxiliary_loss_clip": 0.01134319, + "auxiliary_loss_mlp": 0.0110684, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00050497, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.4668584609708244, + "language_loss": 0.7354871, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75789869, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 4.049460172653198 + }, + { + "auxiliary_loss_clip": 0.01149478, + "auxiliary_loss_mlp": 0.00747641, + "balance_loss_clip": 1.00182116, + "balance_loss_mlp": 1.0005703, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.7115735566218393, + "language_loss": 0.84932792, + "learning_rate": 9.22906510853017e-07, + "loss": 0.86829907, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 3.912752866744995 + }, + { + "auxiliary_loss_clip": 0.0108613, + "auxiliary_loss_mlp": 0.01107049, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00061905, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.7649566360708204, + "language_loss": 0.72853374, + "learning_rate": 9.225783725640786e-07, + "loss": 0.75046557, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.726034164428711 + }, + { + "auxiliary_loss_clip": 0.01130757, + "auxiliary_loss_mlp": 0.01083574, + "balance_loss_clip": 1.00120401, + "balance_loss_mlp": 1.00003195, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8907184037300518, + "language_loss": 0.66656083, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68870419, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 3.1729001998901367 + }, + { + "auxiliary_loss_clip": 0.01132445, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00067401, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 2.179346503677414, + "language_loss": 0.74693692, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76934767, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.5904290676116943 + }, + { + "auxiliary_loss_clip": 0.01151179, + "auxiliary_loss_mlp": 0.01108012, + "balance_loss_clip": 1.00194681, + "balance_loss_mlp": 1.00062871, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.7533101109397218, + "language_loss": 0.61743087, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64002275, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 2.5517446994781494 + }, + { + "auxiliary_loss_clip": 0.01132395, + "auxiliary_loss_mlp": 0.01106542, + "balance_loss_clip": 1.00182784, + "balance_loss_mlp": 1.00058901, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.6183264277466336, + "language_loss": 0.72885251, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75124192, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.615703582763672 + }, + { + "auxiliary_loss_clip": 0.01132128, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.00162458, + "balance_loss_mlp": 1.00053704, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.3974724277434893, + "language_loss": 0.70182896, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72062612, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 2.681074857711792 + }, + { + "auxiliary_loss_clip": 0.01100525, + "auxiliary_loss_mlp": 0.01106739, + "balance_loss_clip": 1.00169754, + "balance_loss_mlp": 1.00059497, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.6708921694319823, + "language_loss": 0.74503434, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76710701, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 4.0472235679626465 + }, + { + "auxiliary_loss_clip": 0.01165805, + "auxiliary_loss_mlp": 0.01106773, + "balance_loss_clip": 1.00195658, + "balance_loss_mlp": 1.00062943, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.8534440397890442, + "language_loss": 0.74425983, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76698565, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.4989824295043945 + }, + { + "auxiliary_loss_clip": 0.01134016, + "auxiliary_loss_mlp": 0.01106615, + "balance_loss_clip": 1.00163269, + "balance_loss_mlp": 1.00056672, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.5359964703822073, + "language_loss": 0.68139195, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70379829, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.650526285171509 + }, + { + "auxiliary_loss_clip": 0.01133761, + "auxiliary_loss_mlp": 0.01106898, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00056374, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.7759297522450666, + "language_loss": 0.73954254, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76194906, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.5660996437072754 + }, + { + "auxiliary_loss_clip": 0.01118609, + "auxiliary_loss_mlp": 0.01106037, + "balance_loss_clip": 1.00178587, + "balance_loss_mlp": 1.00056052, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6122425707194867, + "language_loss": 0.7975949, + "learning_rate": 9.19299238803515e-07, + "loss": 0.81984127, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.601661443710327 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_clip": 1.00190079, + "balance_loss_mlp": 1.00070143, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.6455753098548656, + "language_loss": 0.80733919, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82960832, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.6396186351776123 + }, + { + "auxiliary_loss_clip": 0.0114892, + "auxiliary_loss_mlp": 0.01106487, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00072432, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.5850942605085137, + "language_loss": 0.85829729, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88085139, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.6303482055664062 + }, + { + "auxiliary_loss_clip": 0.01130728, + "auxiliary_loss_mlp": 0.00747514, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00061893, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.6421173052913174, + "language_loss": 0.75835264, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77713513, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.57706880569458 + }, + { + "auxiliary_loss_clip": 0.01089523, + "auxiliary_loss_mlp": 0.01107903, + "balance_loss_clip": 1.00178266, + "balance_loss_mlp": 1.00061488, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.955586536861065, + "language_loss": 0.77565348, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79762781, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 2.748965263366699 + }, + { + "auxiliary_loss_clip": 0.01151324, + "auxiliary_loss_mlp": 0.0110707, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00073528, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.7787373778598004, + "language_loss": 0.73358619, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75617015, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.559563159942627 + }, + { + "auxiliary_loss_clip": 0.01066879, + "auxiliary_loss_mlp": 0.01107168, + "balance_loss_clip": 1.00157833, + "balance_loss_mlp": 1.00054717, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 1.7672068329300397, + "language_loss": 0.73107868, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75281918, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.812492847442627 + }, + { + "auxiliary_loss_clip": 0.01150709, + "auxiliary_loss_mlp": 0.01106804, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00056434, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.703764456571818, + "language_loss": 0.76991248, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79248756, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 2.8245832920074463 + }, + { + "auxiliary_loss_clip": 0.01119713, + "auxiliary_loss_mlp": 0.01107086, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00065565, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.5784119858355223, + "language_loss": 0.73192453, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75419253, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 2.6378977298736572 + }, + { + "auxiliary_loss_clip": 0.01088548, + "auxiliary_loss_mlp": 0.00747452, + "balance_loss_clip": 1.00168586, + "balance_loss_mlp": 1.00044537, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.7793635368165062, + "language_loss": 0.87907827, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89743829, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 2.7234840393066406 + }, + { + "auxiliary_loss_clip": 0.01132545, + "auxiliary_loss_mlp": 0.0110691, + "balance_loss_clip": 1.00162768, + "balance_loss_mlp": 1.00047994, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 5.648808418989104, + "language_loss": 0.69988143, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72227603, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 2.6501708030700684 + }, + { + "auxiliary_loss_clip": 0.01130556, + "auxiliary_loss_mlp": 0.01105775, + "balance_loss_clip": 1.00178051, + "balance_loss_mlp": 1.00048876, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 1.781144254396436, + "language_loss": 0.76996654, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79232991, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 2.569324254989624 + }, + { + "auxiliary_loss_clip": 0.01150883, + "auxiliary_loss_mlp": 0.01105663, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00066328, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.809962189994337, + "language_loss": 0.74896741, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77153289, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.5850183963775635 + }, + { + "auxiliary_loss_clip": 0.01100554, + "auxiliary_loss_mlp": 0.01106291, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00052845, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 3.7329849447665904, + "language_loss": 0.64211363, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66418207, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.6901867389678955 + }, + { + "auxiliary_loss_clip": 0.01117242, + "auxiliary_loss_mlp": 0.01107701, + "balance_loss_clip": 1.00178778, + "balance_loss_mlp": 1.0006032, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 2.1383685051769357, + "language_loss": 0.75405431, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77630377, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 2.6049861907958984 + }, + { + "auxiliary_loss_clip": 0.0111961, + "auxiliary_loss_mlp": 0.01106753, + "balance_loss_clip": 1.00189364, + "balance_loss_mlp": 1.00051355, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.5997336811999483, + "language_loss": 0.62410128, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64636493, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 2.6764559745788574 + }, + { + "auxiliary_loss_clip": 0.01115251, + "auxiliary_loss_mlp": 0.01107085, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00055921, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.6502261903836473, + "language_loss": 0.82789969, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85012305, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 2.6070127487182617 + }, + { + "auxiliary_loss_clip": 0.01080006, + "auxiliary_loss_mlp": 0.01105925, + "balance_loss_clip": 1.00166142, + "balance_loss_mlp": 1.00063956, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.3729580205670777, + "language_loss": 0.78190899, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80376828, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 4.288065195083618 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01106901, + "balance_loss_clip": 1.00171995, + "balance_loss_mlp": 1.00056648, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 2.1753988980045134, + "language_loss": 0.74833691, + "learning_rate": 9.134071334081907e-07, + "loss": 0.77042931, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 2.677711248397827 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01106067, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.00059068, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 1.8706126986754936, + "language_loss": 0.53050232, + "learning_rate": 9.130801849869694e-07, + "loss": 0.55259287, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 2.71516489982605 + }, + { + "auxiliary_loss_clip": 0.01151031, + "auxiliary_loss_mlp": 0.01106142, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00057065, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.6448051216808952, + "language_loss": 0.73248851, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75506026, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 2.525064706802368 + }, + { + "auxiliary_loss_clip": 0.01165775, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_clip": 1.00190818, + "balance_loss_mlp": 1.00059342, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.9411010581738402, + "language_loss": 0.76071125, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78344494, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 2.4733481407165527 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01106971, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00054133, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.4496034282380938, + "language_loss": 0.64443278, + "learning_rate": 9.120995870695376e-07, + "loss": 0.6669966, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.6980066299438477 + }, + { + "auxiliary_loss_clip": 0.01134847, + "auxiliary_loss_mlp": 0.01107205, + "balance_loss_clip": 1.00190902, + "balance_loss_mlp": 1.0007751, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.987152057345548, + "language_loss": 0.62549603, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64791656, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.584625482559204 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01108085, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00079632, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.820058873523459, + "language_loss": 0.7766425, + "learning_rate": 9.114460613703887e-07, + "loss": 0.7989217, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.684284210205078 + }, + { + "auxiliary_loss_clip": 0.01150941, + "auxiliary_loss_mlp": 0.01107654, + "balance_loss_clip": 1.00193655, + "balance_loss_mlp": 1.0006516, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.7124721524912583, + "language_loss": 0.81697947, + "learning_rate": 9.111193604317304e-07, + "loss": 0.8395654, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.5307998657226562 + }, + { + "auxiliary_loss_clip": 0.01149501, + "auxiliary_loss_mlp": 0.01106611, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00065827, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.5802054985231346, + "language_loss": 0.76870775, + "learning_rate": 9.107927007835361e-07, + "loss": 0.79126883, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 5.429778099060059 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01104716, + "balance_loss_clip": 1.00171697, + "balance_loss_mlp": 1.00047898, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 1.7767514012082384, + "language_loss": 0.68222767, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70446467, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.5930511951446533 + }, + { + "auxiliary_loss_clip": 0.01117596, + "auxiliary_loss_mlp": 0.01108106, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00053132, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.601673312623195, + "language_loss": 0.64249229, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66474926, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.634989023208618 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01106614, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.0006609, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 2.1170454574021025, + "language_loss": 0.70194972, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72403842, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.6264867782592773 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.0110566, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00046968, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.4065472668863297, + "language_loss": 0.76179063, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78418905, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 2.6071903705596924 + }, + { + "auxiliary_loss_clip": 0.01134077, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00044513, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.6254256354410301, + "language_loss": 0.79494619, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81734812, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 2.578153133392334 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01105483, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00067389, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.5018973676055842, + "language_loss": 0.75989974, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78244525, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 3.9965860843658447 + }, + { + "auxiliary_loss_clip": 0.01165637, + "auxiliary_loss_mlp": 0.00747492, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.0005827, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.598294702669381, + "language_loss": 0.72486609, + "learning_rate": 9.085072404194436e-07, + "loss": 0.7439974, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 2.626000165939331 + }, + { + "auxiliary_loss_clip": 0.01132863, + "auxiliary_loss_mlp": 0.01108139, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00056481, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 2.1819214586899425, + "language_loss": 0.78116679, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80357683, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 2.606614112854004 + }, + { + "auxiliary_loss_clip": 0.0114915, + "auxiliary_loss_mlp": 0.01105861, + "balance_loss_clip": 1.00172603, + "balance_loss_mlp": 1.00057518, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.6925849596239704, + "language_loss": 0.69241625, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71496636, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.6272921562194824 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01108005, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00052595, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.4491372735680346, + "language_loss": 0.67049098, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69291425, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.5698580741882324 + }, + { + "auxiliary_loss_clip": 0.01132391, + "auxiliary_loss_mlp": 0.01107643, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00064075, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.2212113874825214, + "language_loss": 0.58898962, + "learning_rate": 9.072021733655007e-07, + "loss": 0.61138994, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.5843071937561035 + }, + { + "auxiliary_loss_clip": 0.01119287, + "auxiliary_loss_mlp": 0.01106511, + "balance_loss_clip": 1.00155783, + "balance_loss_mlp": 1.00036716, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 3.4071784457699046, + "language_loss": 0.71066326, + "learning_rate": 9.068760101685971e-07, + "loss": 0.7329213, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.614481210708618 + }, + { + "auxiliary_loss_clip": 0.0113021, + "auxiliary_loss_mlp": 0.01083562, + "balance_loss_clip": 1.00120139, + "balance_loss_mlp": 1.0000205, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 1.1259599092614736, + "language_loss": 0.59034634, + "learning_rate": 9.065498884230638e-07, + "loss": 0.6124841, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.263575553894043 + }, + { + "auxiliary_loss_clip": 0.01149477, + "auxiliary_loss_mlp": 0.00747477, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00054669, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.4986655578381534, + "language_loss": 0.72579455, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74476409, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.578315496444702 + }, + { + "auxiliary_loss_clip": 0.01147079, + "auxiliary_loss_mlp": 0.00746398, + "balance_loss_clip": 1.00130498, + "balance_loss_mlp": 1.00063169, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7431473459832479, + "language_loss": 0.55524492, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57417965, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.1538422107696533 + }, + { + "auxiliary_loss_clip": 0.01148963, + "auxiliary_loss_mlp": 0.01105867, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00067675, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.6208831933418437, + "language_loss": 0.77774656, + "learning_rate": 9.055717720183505e-07, + "loss": 0.80029488, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.6247522830963135 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_clip": 1.0017457, + "balance_loss_mlp": 1.00051272, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.6594944341024185, + "language_loss": 0.63859916, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66097611, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 2.6628000736236572 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00068808, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.4694232781915901, + "language_loss": 0.86900908, + "learning_rate": 9.049199018987437e-07, + "loss": 0.89122236, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.668450355529785 + }, + { + "auxiliary_loss_clip": 0.0116594, + "auxiliary_loss_mlp": 0.00747602, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.00042725, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.708616354307422, + "language_loss": 0.84571481, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86485022, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.4939870834350586 + }, + { + "auxiliary_loss_clip": 0.0114915, + "auxiliary_loss_mlp": 0.01107499, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00049675, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.7673449959227356, + "language_loss": 0.75627351, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77883995, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.5408833026885986 + }, + { + "auxiliary_loss_clip": 0.01133868, + "auxiliary_loss_mlp": 0.01106506, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.0006485, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.9140937373706197, + "language_loss": 0.76095426, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78335798, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 2.5755159854888916 + }, + { + "auxiliary_loss_clip": 0.01102952, + "auxiliary_loss_mlp": 0.01107331, + "balance_loss_clip": 1.00159585, + "balance_loss_mlp": 1.00061488, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.8120133553455262, + "language_loss": 0.71226794, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73437077, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 2.630342721939087 + }, + { + "auxiliary_loss_clip": 0.01149114, + "auxiliary_loss_mlp": 0.01105791, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00060034, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.5830741489511437, + "language_loss": 0.79473984, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81728888, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.5327987670898438 + }, + { + "auxiliary_loss_clip": 0.01119062, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.00049019, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.5390215177914137, + "language_loss": 0.78525954, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80751932, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.662075996398926 + }, + { + "auxiliary_loss_clip": 0.01132233, + "auxiliary_loss_mlp": 0.00747489, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00049591, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 1.9385532759510389, + "language_loss": 0.80801654, + "learning_rate": 9.026396651834834e-07, + "loss": 0.8268137, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 2.5685150623321533 + }, + { + "auxiliary_loss_clip": 0.01161253, + "auxiliary_loss_mlp": 0.00746352, + "balance_loss_clip": 1.00123382, + "balance_loss_mlp": 1.00054538, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6884905995098405, + "language_loss": 0.53710121, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55617726, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.098656177520752 + }, + { + "auxiliary_loss_clip": 0.01151151, + "auxiliary_loss_mlp": 0.01105927, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00064147, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.3824817226231156, + "language_loss": 0.73676449, + "learning_rate": 9.01988543302e-07, + "loss": 0.75933528, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 2.597993850708008 + }, + { + "auxiliary_loss_clip": 0.01132671, + "auxiliary_loss_mlp": 0.011072, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00067425, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.7448165277009178, + "language_loss": 0.74063748, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76303619, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 3.949641227722168 + }, + { + "auxiliary_loss_clip": 0.01165798, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00070763, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.4627962289421659, + "language_loss": 0.84726268, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86999106, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.538470506668091 + }, + { + "auxiliary_loss_clip": 0.01165896, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00078189, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 2.3209838631957127, + "language_loss": 0.6744231, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69714558, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 2.6158523559570312 + }, + { + "auxiliary_loss_clip": 0.01133817, + "auxiliary_loss_mlp": 0.0110786, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00047636, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.6156499402480218, + "language_loss": 0.79389608, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81631279, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 2.5822107791900635 + }, + { + "auxiliary_loss_clip": 0.01149091, + "auxiliary_loss_mlp": 0.01106223, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00055575, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 2.3426731897382287, + "language_loss": 0.72821212, + "learning_rate": 9.003614674565934e-07, + "loss": 0.75076526, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 2.518981456756592 + }, + { + "auxiliary_loss_clip": 0.01116972, + "auxiliary_loss_mlp": 0.01105853, + "balance_loss_clip": 1.00176775, + "balance_loss_mlp": 1.00047159, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 2.1738302345638414, + "language_loss": 0.77817756, + "learning_rate": 9.000361773333705e-07, + "loss": 0.8004058, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 2.65510630607605 + }, + { + "auxiliary_loss_clip": 0.0108557, + "auxiliary_loss_mlp": 0.01106592, + "balance_loss_clip": 1.00166297, + "balance_loss_mlp": 1.00073433, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.248408142446172, + "language_loss": 0.60167825, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62359989, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 2.7809972763061523 + }, + { + "auxiliary_loss_clip": 0.01120566, + "auxiliary_loss_mlp": 0.01106065, + "balance_loss_clip": 1.00175452, + "balance_loss_mlp": 1.00068378, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.6615290677160623, + "language_loss": 0.85480899, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87707531, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 2.5850963592529297 + }, + { + "auxiliary_loss_clip": 0.01151125, + "auxiliary_loss_mlp": 0.01106991, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.0006566, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.705115938379802, + "language_loss": 0.70281625, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72539735, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.604053020477295 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01106634, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00068045, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.4527332695988444, + "language_loss": 0.78656548, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80878949, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.6270928382873535 + }, + { + "auxiliary_loss_clip": 0.01132343, + "auxiliary_loss_mlp": 0.01105738, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00054812, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 2.061936491136653, + "language_loss": 0.76432019, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78670102, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 5.494824647903442 + }, + { + "auxiliary_loss_clip": 0.01165552, + "auxiliary_loss_mlp": 0.0110594, + "balance_loss_clip": 1.00189245, + "balance_loss_mlp": 1.00055909, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.8548180188430088, + "language_loss": 0.78347254, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80618745, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.479882001876831 + }, + { + "auxiliary_loss_clip": 0.01150544, + "auxiliary_loss_mlp": 0.01107377, + "balance_loss_clip": 1.00184441, + "balance_loss_mlp": 1.00056565, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 2.1396750617695584, + "language_loss": 0.69572723, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71830642, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.5362658500671387 + }, + { + "auxiliary_loss_clip": 0.01147322, + "auxiliary_loss_mlp": 0.01105852, + "balance_loss_clip": 1.00202119, + "balance_loss_mlp": 1.00056589, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.086108463590752, + "language_loss": 0.7340014, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75653315, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.529139757156372 + }, + { + "auxiliary_loss_clip": 0.01115302, + "auxiliary_loss_mlp": 0.01108464, + "balance_loss_clip": 1.00196719, + "balance_loss_mlp": 1.00060332, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.7092198442541369, + "language_loss": 0.71563959, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73787725, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 2.7143170833587646 + }, + { + "auxiliary_loss_clip": 0.01131724, + "auxiliary_loss_mlp": 0.01083943, + "balance_loss_clip": 1.00118542, + "balance_loss_mlp": 1.00001931, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9255509452956433, + "language_loss": 0.58471525, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60687184, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 3.0829391479492188 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01106899, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00056458, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.854376352933859, + "language_loss": 0.74270844, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76494813, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 4.0259315967559814 + }, + { + "auxiliary_loss_clip": 0.01134479, + "auxiliary_loss_mlp": 0.01106572, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00061834, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.265032258226846, + "language_loss": 0.76603484, + "learning_rate": 8.961359528185313e-07, + "loss": 0.78844535, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 2.6474969387054443 + }, + { + "auxiliary_loss_clip": 0.01147587, + "auxiliary_loss_mlp": 0.01106349, + "balance_loss_clip": 1.00209463, + "balance_loss_mlp": 1.00068212, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.730516845760726, + "language_loss": 0.725568, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74810743, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.5421926975250244 + }, + { + "auxiliary_loss_clip": 0.0113381, + "auxiliary_loss_mlp": 0.01105367, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00046313, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.91335323394242, + "language_loss": 0.76873136, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79112309, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 2.609992742538452 + }, + { + "auxiliary_loss_clip": 0.01148976, + "auxiliary_loss_mlp": 0.0110619, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00052309, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 1.7354371400874624, + "language_loss": 0.74270809, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76525974, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.570211410522461 + }, + { + "auxiliary_loss_clip": 0.01150723, + "auxiliary_loss_mlp": 0.01105669, + "balance_loss_clip": 1.00189626, + "balance_loss_mlp": 1.00057387, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.7838482081831226, + "language_loss": 0.74768674, + "learning_rate": 8.948372164052118e-07, + "loss": 0.77025068, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.5058162212371826 + }, + { + "auxiliary_loss_clip": 0.01132267, + "auxiliary_loss_mlp": 0.01105591, + "balance_loss_clip": 1.00167167, + "balance_loss_mlp": 1.00068724, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.638667128157197, + "language_loss": 0.70590246, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72828102, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.692920207977295 + }, + { + "auxiliary_loss_clip": 0.01132598, + "auxiliary_loss_mlp": 0.01107937, + "balance_loss_clip": 1.00209665, + "balance_loss_mlp": 1.00074363, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.8550373161919975, + "language_loss": 0.74627578, + "learning_rate": 8.941880995966095e-07, + "loss": 0.76868117, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.618887424468994 + }, + { + "auxiliary_loss_clip": 0.01119519, + "auxiliary_loss_mlp": 0.01106478, + "balance_loss_clip": 1.00182569, + "balance_loss_mlp": 1.00052512, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.8047633076006901, + "language_loss": 0.74777514, + "learning_rate": 8.938636040849014e-07, + "loss": 0.77003515, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.6319522857666016 + }, + { + "auxiliary_loss_clip": 0.0114901, + "auxiliary_loss_mlp": 0.01105944, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.0004673, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 2.4846187817859007, + "language_loss": 0.7856192, + "learning_rate": 8.935391505179966e-07, + "loss": 0.80816877, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.5767245292663574 + }, + { + "auxiliary_loss_clip": 0.01098466, + "auxiliary_loss_mlp": 0.01106231, + "balance_loss_clip": 1.00158525, + "balance_loss_mlp": 1.00046873, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.5572411932855013, + "language_loss": 0.56820285, + "learning_rate": 8.932147389081985e-07, + "loss": 0.59024978, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.6419122219085693 + }, + { + "auxiliary_loss_clip": 0.01073271, + "auxiliary_loss_mlp": 0.01105746, + "balance_loss_clip": 1.00177729, + "balance_loss_mlp": 1.00046039, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.2990123914137492, + "language_loss": 0.76824039, + "learning_rate": 8.928903692678081e-07, + "loss": 0.7900306, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 2.8484838008880615 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01106636, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.0005877, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.874730886292617, + "language_loss": 0.79669535, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81893814, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 2.6240651607513428 + }, + { + "auxiliary_loss_clip": 0.01117431, + "auxiliary_loss_mlp": 0.01104964, + "balance_loss_clip": 1.00147176, + "balance_loss_mlp": 1.00044084, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.7503041000584734, + "language_loss": 0.72102934, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74325329, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 2.631863832473755 + }, + { + "auxiliary_loss_clip": 0.01133476, + "auxiliary_loss_mlp": 0.01106732, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00049305, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 2.254471729676261, + "language_loss": 0.66246873, + "learning_rate": 8.919175122860787e-07, + "loss": 0.68487084, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.618396759033203 + }, + { + "auxiliary_loss_clip": 0.01165692, + "auxiliary_loss_mlp": 0.01105749, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00055909, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.073586948129787, + "language_loss": 0.76546597, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78818035, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.48854398727417 + }, + { + "auxiliary_loss_clip": 0.01134257, + "auxiliary_loss_mlp": 0.01106122, + "balance_loss_clip": 1.00183117, + "balance_loss_mlp": 1.00055075, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 2.6041038264856273, + "language_loss": 0.69930845, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72171223, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.543186664581299 + }, + { + "auxiliary_loss_clip": 0.01117974, + "auxiliary_loss_mlp": 0.01107298, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00067711, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 1.4862100771169298, + "language_loss": 0.82050395, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84275663, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 2.6294569969177246 + }, + { + "auxiliary_loss_clip": 0.01089486, + "auxiliary_loss_mlp": 0.01107638, + "balance_loss_clip": 1.00174046, + "balance_loss_mlp": 1.00063562, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.4369818103020453, + "language_loss": 0.7971611, + "learning_rate": 8.906209579615107e-07, + "loss": 0.81913227, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.7125496864318848 + }, + { + "auxiliary_loss_clip": 0.01165519, + "auxiliary_loss_mlp": 0.01106299, + "balance_loss_clip": 1.00184953, + "balance_loss_mlp": 1.00053632, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.5727788981066915, + "language_loss": 0.77801621, + "learning_rate": 8.90296924519055e-07, + "loss": 0.8007344, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.5383594036102295 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01104754, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00070775, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.7112550820304269, + "language_loss": 0.78577602, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80832654, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 2.547267198562622 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01105969, + "balance_loss_clip": 1.00191259, + "balance_loss_mlp": 1.00049281, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 1.9897279274004167, + "language_loss": 0.7304157, + "learning_rate": 8.896489838865857e-07, + "loss": 0.75279629, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 2.582383871078491 + }, + { + "auxiliary_loss_clip": 0.01132648, + "auxiliary_loss_mlp": 0.0110513, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00060713, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.8429081596747385, + "language_loss": 0.75554895, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77792668, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 4.038685083389282 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01105837, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00055134, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.787020113923593, + "language_loss": 0.63702899, + "learning_rate": 8.890012116726012e-07, + "loss": 0.65942514, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 2.661038398742676 + }, + { + "auxiliary_loss_clip": 0.01096715, + "auxiliary_loss_mlp": 0.0108492, + "balance_loss_clip": 1.0013442, + "balance_loss_mlp": 1.00023353, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.754886290098509, + "language_loss": 0.61252785, + "learning_rate": 8.88677388753248e-07, + "loss": 0.6343441, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.356187105178833 + }, + { + "auxiliary_loss_clip": 0.01083668, + "auxiliary_loss_mlp": 0.00747367, + "balance_loss_clip": 1.00165677, + "balance_loss_mlp": 1.00044596, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.5556795820781935, + "language_loss": 0.69106293, + "learning_rate": 8.883536079753582e-07, + "loss": 0.7093733, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 2.923112392425537 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01105548, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.00054896, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.4878497078589477, + "language_loss": 0.62419683, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64644361, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 2.6785945892333984 + }, + { + "auxiliary_loss_clip": 0.01131692, + "auxiliary_loss_mlp": 0.0110434, + "balance_loss_clip": 1.00178468, + "balance_loss_mlp": 1.00029433, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 2.0162207024271726, + "language_loss": 0.54417366, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56653398, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.639766216278076 + }, + { + "auxiliary_loss_clip": 0.01149133, + "auxiliary_loss_mlp": 0.01106046, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00047457, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 1.8351442913226366, + "language_loss": 0.77044821, + "learning_rate": 8.87382518613248e-07, + "loss": 0.79299998, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 2.5390264987945557 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.00747601, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00054014, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.114026620894439, + "language_loss": 0.71380305, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73260278, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 2.5636837482452393 + }, + { + "auxiliary_loss_clip": 0.01165826, + "auxiliary_loss_mlp": 0.01106399, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00063634, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.8111299670838044, + "language_loss": 0.76109588, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78381813, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 2.524118185043335 + }, + { + "auxiliary_loss_clip": 0.01150974, + "auxiliary_loss_mlp": 0.01106714, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00056994, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.7631382689568371, + "language_loss": 0.74937379, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77195066, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.54429292678833 + }, + { + "auxiliary_loss_clip": 0.0113429, + "auxiliary_loss_mlp": 0.01106016, + "balance_loss_clip": 1.00179291, + "balance_loss_mlp": 1.00054002, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 2.1351119639873413, + "language_loss": 0.89105064, + "learning_rate": 8.860883235222791e-07, + "loss": 0.9134537, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 5.400600433349609 + }, + { + "auxiliary_loss_clip": 0.01150347, + "auxiliary_loss_mlp": 0.0110734, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.00071955, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.384348124241983, + "language_loss": 0.69673359, + "learning_rate": 8.85764880317974e-07, + "loss": 0.7193104, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 2.5516796112060547 + }, + { + "auxiliary_loss_clip": 0.01117879, + "auxiliary_loss_mlp": 0.01106929, + "balance_loss_clip": 1.00171232, + "balance_loss_mlp": 1.00049949, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.5565078541273707, + "language_loss": 0.76611722, + "learning_rate": 8.854414793655771e-07, + "loss": 0.78836536, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 2.680772304534912 + }, + { + "auxiliary_loss_clip": 0.01148708, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00051618, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.8286912832284408, + "language_loss": 0.72147262, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74043459, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.53344988822937 + }, + { + "auxiliary_loss_clip": 0.01134946, + "auxiliary_loss_mlp": 0.00747361, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00050902, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 3.0460233314594376, + "language_loss": 0.75968778, + "learning_rate": 8.847948042655567e-07, + "loss": 0.77851081, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 2.590503692626953 + }, + { + "auxiliary_loss_clip": 0.01100696, + "auxiliary_loss_mlp": 0.01105997, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00052071, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 1.516002299845676, + "language_loss": 0.62299579, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64506269, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.6519813537597656 + }, + { + "auxiliary_loss_clip": 0.01147437, + "auxiliary_loss_mlp": 0.01107392, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00058007, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.240402483200018, + "language_loss": 0.81742859, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83997691, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 2.5655486583709717 + }, + { + "auxiliary_loss_clip": 0.01148965, + "auxiliary_loss_mlp": 0.01105763, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00066781, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.5935676601842672, + "language_loss": 0.70532954, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72787678, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 3.9375860691070557 + }, + { + "auxiliary_loss_clip": 0.01132636, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00067425, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 1.705176717014803, + "language_loss": 0.82148397, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84387946, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.591660737991333 + }, + { + "auxiliary_loss_clip": 0.01132366, + "auxiliary_loss_mlp": 0.01106049, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.0005728, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.85839360351043, + "language_loss": 0.78843367, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81081784, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 2.5775747299194336 + }, + { + "auxiliary_loss_clip": 0.01135233, + "auxiliary_loss_mlp": 0.01105854, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 1.00056839, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.7323795601253276, + "language_loss": 0.89891016, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92132103, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.56996750831604 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01106752, + "balance_loss_clip": 1.00160503, + "balance_loss_mlp": 1.00051236, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 1.4583758648728982, + "language_loss": 0.64003474, + "learning_rate": 8.82532774152765e-07, + "loss": 0.6622799, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.62237811088562 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01104694, + "balance_loss_clip": 1.00169694, + "balance_loss_mlp": 1.0006485, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.699297708168884, + "language_loss": 0.84585083, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86806464, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.7319905757904053 + }, + { + "auxiliary_loss_clip": 0.01148663, + "auxiliary_loss_mlp": 0.01106611, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00056219, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.8420878062430233, + "language_loss": 0.71101624, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73356897, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.5348973274230957 + }, + { + "auxiliary_loss_clip": 0.01151172, + "auxiliary_loss_mlp": 0.01106404, + "balance_loss_clip": 1.00195372, + "balance_loss_mlp": 1.00054586, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.8601390051202282, + "language_loss": 0.81021917, + "learning_rate": 8.815639680478573e-07, + "loss": 0.8327949, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.5332653522491455 + }, + { + "auxiliary_loss_clip": 0.01150553, + "auxiliary_loss_mlp": 0.01105269, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00065088, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 2.273657592036112, + "language_loss": 0.754825, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77738321, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.585758686065674 + }, + { + "auxiliary_loss_clip": 0.010693, + "auxiliary_loss_mlp": 0.01106144, + "balance_loss_clip": 1.00165582, + "balance_loss_mlp": 1.00057268, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 1.9950409479735476, + "language_loss": 0.76916063, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79091513, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.771514892578125 + }, + { + "auxiliary_loss_clip": 0.01132486, + "auxiliary_loss_mlp": 0.01105542, + "balance_loss_clip": 1.00167632, + "balance_loss_mlp": 1.00054288, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.146421652736797, + "language_loss": 0.72919321, + "learning_rate": 8.80595543643797e-07, + "loss": 0.7515735, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.574580669403076 + }, + { + "auxiliary_loss_clip": 0.01165681, + "auxiliary_loss_mlp": 0.01106182, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00070524, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.6428606838308542, + "language_loss": 0.84161955, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86433816, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.532212972640991 + }, + { + "auxiliary_loss_clip": 0.01120134, + "auxiliary_loss_mlp": 0.01106318, + "balance_loss_clip": 1.00188315, + "balance_loss_mlp": 1.00065112, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.4052012308840234, + "language_loss": 0.59662777, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61889231, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 2.6287789344787598 + }, + { + "auxiliary_loss_clip": 0.01133893, + "auxiliary_loss_mlp": 0.01106443, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00068021, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.626538062146351, + "language_loss": 0.83009452, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85249794, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 2.6438167095184326 + }, + { + "auxiliary_loss_clip": 0.0114895, + "auxiliary_loss_mlp": 0.01105607, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00051188, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 2.0959499406992093, + "language_loss": 0.67270529, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69525081, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.696472644805908 + }, + { + "auxiliary_loss_clip": 0.01087705, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00052094, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.409539814063716, + "language_loss": 0.72917521, + "learning_rate": 8.789823520920794e-07, + "loss": 0.7511133, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.6788229942321777 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01107083, + "balance_loss_clip": 1.00197661, + "balance_loss_mlp": 1.00065255, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.9437654255225096, + "language_loss": 0.68172562, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70380163, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 2.691035747528076 + }, + { + "auxiliary_loss_clip": 0.01084103, + "auxiliary_loss_mlp": 0.01104932, + "balance_loss_clip": 1.00161254, + "balance_loss_mlp": 1.00040889, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.5819122845021614, + "language_loss": 0.62529993, + "learning_rate": 8.783373729494721e-07, + "loss": 0.64719033, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.673415422439575 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.0110657, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00042605, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.8106080827934, + "language_loss": 0.60759974, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63032478, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 2.644925594329834 + }, + { + "auxiliary_loss_clip": 0.01149012, + "auxiliary_loss_mlp": 0.01106716, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.0006671, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6819301647719853, + "language_loss": 0.78039664, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80295384, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.518298387527466 + }, + { + "auxiliary_loss_clip": 0.01117749, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_clip": 1.001876, + "balance_loss_mlp": 1.00069034, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.812937698891482, + "language_loss": 0.65895212, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68118268, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 2.6233320236206055 + }, + { + "auxiliary_loss_clip": 0.01134226, + "auxiliary_loss_mlp": 0.00747557, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00052416, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.6071409999744235, + "language_loss": 0.70233518, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72115302, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 4.029844522476196 + }, + { + "auxiliary_loss_clip": 0.01165678, + "auxiliary_loss_mlp": 0.01104791, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00055456, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.5900802469822526, + "language_loss": 0.62578022, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64848495, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 2.5082991123199463 + }, + { + "auxiliary_loss_clip": 0.01151195, + "auxiliary_loss_mlp": 0.01106768, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.00052881, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.0247215782464507, + "language_loss": 0.68865448, + "learning_rate": 8.764034567182581e-07, + "loss": 0.71123415, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 2.6547274589538574 + }, + { + "auxiliary_loss_clip": 0.01165734, + "auxiliary_loss_mlp": 0.01105846, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00075138, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.6508438182178222, + "language_loss": 0.72259641, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74531221, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.507477045059204 + }, + { + "auxiliary_loss_clip": 0.01165811, + "auxiliary_loss_mlp": 0.01106066, + "balance_loss_clip": 1.00210142, + "balance_loss_mlp": 1.00078034, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.5631743551004964, + "language_loss": 0.74010479, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76282358, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.508939266204834 + }, + { + "auxiliary_loss_clip": 0.0114864, + "auxiliary_loss_mlp": 0.01107295, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.00067472, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.0998256347489663, + "language_loss": 0.89709187, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91965127, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 2.528653383255005 + }, + { + "auxiliary_loss_clip": 0.01132604, + "auxiliary_loss_mlp": 0.01106744, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.0007906, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.5842910506409482, + "language_loss": 0.79667866, + "learning_rate": 8.751150312056792e-07, + "loss": 0.81907213, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 2.5883827209472656 + }, + { + "auxiliary_loss_clip": 0.01165908, + "auxiliary_loss_mlp": 0.01107027, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.00050163, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 2.0913592484141246, + "language_loss": 0.67018008, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69290942, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 2.5799055099487305 + }, + { + "auxiliary_loss_clip": 0.01116112, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_clip": 1.00116718, + "balance_loss_mlp": 1.00005674, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 1.0296300799914855, + "language_loss": 0.53153199, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55352908, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 3.3318514823913574 + }, + { + "auxiliary_loss_clip": 0.0113278, + "auxiliary_loss_mlp": 0.01106423, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00046992, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.5839240112921928, + "language_loss": 0.81991678, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84230882, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 5.5519890785217285 + }, + { + "auxiliary_loss_clip": 0.01165759, + "auxiliary_loss_mlp": 0.01106489, + "balance_loss_clip": 1.00189388, + "balance_loss_mlp": 1.00053585, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 1.9392465384199276, + "language_loss": 0.82699776, + "learning_rate": 8.738272881850801e-07, + "loss": 0.8497203, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 2.5274770259857178 + }, + { + "auxiliary_loss_clip": 0.01100717, + "auxiliary_loss_mlp": 0.01106471, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00061333, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 1.8191753458027282, + "language_loss": 0.67838562, + "learning_rate": 8.735054591608704e-07, + "loss": 0.70045745, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.6366469860076904 + }, + { + "auxiliary_loss_clip": 0.01149087, + "auxiliary_loss_mlp": 0.01106647, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00059819, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 4.394118370164498, + "language_loss": 0.78214645, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80470377, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.6074061393737793 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01106901, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00075698, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 1.9416360447019925, + "language_loss": 0.82060063, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84301758, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 2.5922422409057617 + }, + { + "auxiliary_loss_clip": 0.01116502, + "auxiliary_loss_mlp": 0.01105235, + "balance_loss_clip": 1.00171614, + "balance_loss_mlp": 1.00061727, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.703767691396493, + "language_loss": 0.75343746, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77565479, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.672443151473999 + }, + { + "auxiliary_loss_clip": 0.01134416, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00049233, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 2.4929129987233707, + "language_loss": 0.78258395, + "learning_rate": 8.722185703539022e-07, + "loss": 0.8049916, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.581188201904297 + }, + { + "auxiliary_loss_clip": 0.0115102, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00057983, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.19969621503184, + "language_loss": 0.74266374, + "learning_rate": 8.718969550356266e-07, + "loss": 0.76524878, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 4.015138626098633 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01105857, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.0004766, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.4793323896551398, + "language_loss": 0.60106492, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62329048, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.686065435409546 + }, + { + "auxiliary_loss_clip": 0.01150846, + "auxiliary_loss_mlp": 0.01106564, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00051546, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.6035120061609993, + "language_loss": 0.8163349, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83890897, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.5317862033843994 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00065184, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.8331673923622724, + "language_loss": 0.68343973, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70600474, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 2.5286059379577637 + }, + { + "auxiliary_loss_clip": 0.0115072, + "auxiliary_loss_mlp": 0.01105458, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00055385, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.4681840554146297, + "language_loss": 0.71256286, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73512465, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 2.5584163665771484 + }, + { + "auxiliary_loss_clip": 0.01149165, + "auxiliary_loss_mlp": 0.01107207, + "balance_loss_clip": 1.00206077, + "balance_loss_mlp": 1.00058651, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.5292425992177086, + "language_loss": 0.71549821, + "learning_rate": 8.702895203548155e-07, + "loss": 0.7380619, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.6853830814361572 + }, + { + "auxiliary_loss_clip": 0.01102928, + "auxiliary_loss_mlp": 0.01106588, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00053966, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.4665801182712563, + "language_loss": 0.77361488, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79570997, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 2.7031872272491455 + }, + { + "auxiliary_loss_clip": 0.01133865, + "auxiliary_loss_mlp": 0.0110564, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00054526, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 7.03852554443198, + "language_loss": 0.78392285, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80631793, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.541539430618286 + }, + { + "auxiliary_loss_clip": 0.01131809, + "auxiliary_loss_mlp": 0.01105551, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.0005517, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 1.9514693145851012, + "language_loss": 0.78454024, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80691385, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.5610005855560303 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.01106532, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00067401, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.670346908921694, + "language_loss": 0.69372141, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71596205, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.6109671592712402 + }, + { + "auxiliary_loss_clip": 0.01150508, + "auxiliary_loss_mlp": 0.01105953, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00047636, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.322529435841499, + "language_loss": 0.74389929, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76646388, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.5814688205718994 + }, + { + "auxiliary_loss_clip": 0.01130989, + "auxiliary_loss_mlp": 0.01106414, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.0004611, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 1.8834418488195366, + "language_loss": 0.70554262, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72791672, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.635376214981079 + }, + { + "auxiliary_loss_clip": 0.0110072, + "auxiliary_loss_mlp": 0.0110682, + "balance_loss_clip": 1.00156331, + "balance_loss_mlp": 1.00048566, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.332791768453204, + "language_loss": 0.72847474, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75055015, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.625838279724121 + }, + { + "auxiliary_loss_clip": 0.01149286, + "auxiliary_loss_mlp": 0.01107281, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00056434, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.7260622975590898, + "language_loss": 0.70183718, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72440279, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.545708656311035 + }, + { + "auxiliary_loss_clip": 0.01102234, + "auxiliary_loss_mlp": 0.01105516, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00061142, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.606340463234027, + "language_loss": 0.78106284, + "learning_rate": 8.673988377928092e-07, + "loss": 0.80314028, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.7287933826446533 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01106934, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00059891, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 1.9743880825937334, + "language_loss": 0.78295696, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80568492, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 2.4964466094970703 + }, + { + "auxiliary_loss_clip": 0.01134151, + "auxiliary_loss_mlp": 0.01104898, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00056577, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 1.7389169614007287, + "language_loss": 0.82440901, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84679949, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.5628814697265625 + }, + { + "auxiliary_loss_clip": 0.0111584, + "auxiliary_loss_mlp": 0.01105708, + "balance_loss_clip": 1.00190866, + "balance_loss_mlp": 1.00051808, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 2.0866098979727994, + "language_loss": 0.69260341, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71481895, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.598921537399292 + }, + { + "auxiliary_loss_clip": 0.0116581, + "auxiliary_loss_mlp": 0.01107388, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00057662, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 2.187583024249612, + "language_loss": 0.81022823, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83296013, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 2.5073251724243164 + }, + { + "auxiliary_loss_clip": 0.01148796, + "auxiliary_loss_mlp": 0.01105484, + "balance_loss_clip": 1.00172424, + "balance_loss_mlp": 1.00057983, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.5283331697401523, + "language_loss": 0.78977978, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81232262, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.6064090728759766 + }, + { + "auxiliary_loss_clip": 0.01150817, + "auxiliary_loss_mlp": 0.01106439, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00048566, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 2.3011414185175765, + "language_loss": 0.83738559, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85995811, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.508103132247925 + }, + { + "auxiliary_loss_clip": 0.01144035, + "auxiliary_loss_mlp": 0.01083359, + "balance_loss_clip": 1.00108194, + "balance_loss_mlp": 1.00019872, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8154419432725937, + "language_loss": 0.53862917, + "learning_rate": 8.651529337861209e-07, + "loss": 0.56090307, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 3.1059553623199463 + }, + { + "auxiliary_loss_clip": 0.01134133, + "auxiliary_loss_mlp": 0.01106018, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.00054169, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 1.8688140251968268, + "language_loss": 0.79071736, + "learning_rate": 8.64832262393344e-07, + "loss": 0.81311882, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 4.069654941558838 + }, + { + "auxiliary_loss_clip": 0.01150819, + "auxiliary_loss_mlp": 0.01105405, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00050092, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 1.9519813902853629, + "language_loss": 0.76876593, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79132819, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 2.4860143661499023 + }, + { + "auxiliary_loss_clip": 0.01150948, + "auxiliary_loss_mlp": 0.01106681, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00063205, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.7883339448203555, + "language_loss": 0.80980349, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83237976, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 2.5492613315582275 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01106569, + "balance_loss_clip": 1.00169802, + "balance_loss_mlp": 1.00071156, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.0751598328147005, + "language_loss": 0.65035868, + "learning_rate": 8.638705065376879e-07, + "loss": 0.6725949, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 2.634068489074707 + }, + { + "auxiliary_loss_clip": 0.01133935, + "auxiliary_loss_mlp": 0.01105701, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00051117, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.275286190891169, + "language_loss": 0.76483274, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78722906, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 2.5880465507507324 + }, + { + "auxiliary_loss_clip": 0.01130292, + "auxiliary_loss_mlp": 0.01083705, + "balance_loss_clip": 1.00128984, + "balance_loss_mlp": 1.00016308, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6983838448862603, + "language_loss": 0.54453009, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56667006, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.290400266647339 + }, + { + "auxiliary_loss_clip": 0.01132535, + "auxiliary_loss_mlp": 0.01105722, + "balance_loss_clip": 1.00172532, + "balance_loss_mlp": 1.00053167, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.6192400434363092, + "language_loss": 0.81620258, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83858514, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 2.5592777729034424 + }, + { + "auxiliary_loss_clip": 0.01151227, + "auxiliary_loss_mlp": 0.0110722, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00059938, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 3.9735319793008443, + "language_loss": 0.75292242, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77550691, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.531933546066284 + }, + { + "auxiliary_loss_clip": 0.0115117, + "auxiliary_loss_mlp": 0.01105186, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.0006628, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.888980544638264, + "language_loss": 0.8652041, + "learning_rate": 8.622684419164883e-07, + "loss": 0.88776767, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 2.515064239501953 + }, + { + "auxiliary_loss_clip": 0.01150793, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.0005182, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.761620651884069, + "language_loss": 0.73180807, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75436354, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 5.30720329284668 + }, + { + "auxiliary_loss_clip": 0.01147396, + "auxiliary_loss_mlp": 0.00747319, + "balance_loss_clip": 1.00216675, + "balance_loss_mlp": 1.00042796, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 2.057128928007804, + "language_loss": 0.72122842, + "learning_rate": 8.616279179832329e-07, + "loss": 0.7401756, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 2.5555152893066406 + }, + { + "auxiliary_loss_clip": 0.0111756, + "auxiliary_loss_mlp": 0.0110685, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00051546, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 3.307922567860362, + "language_loss": 0.51088899, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53313309, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.6458261013031006 + }, + { + "auxiliary_loss_clip": 0.01127529, + "auxiliary_loss_mlp": 0.00746565, + "balance_loss_clip": 1.00115585, + "balance_loss_mlp": 1.00088012, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7279066429704507, + "language_loss": 0.59216106, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61090207, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 3.236238479614258 + }, + { + "auxiliary_loss_clip": 0.01150792, + "auxiliary_loss_mlp": 0.01106066, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00039911, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 1.8080386419198633, + "language_loss": 0.62766087, + "learning_rate": 8.606674558675737e-07, + "loss": 0.65022945, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 2.588815450668335 + }, + { + "auxiliary_loss_clip": 0.0116573, + "auxiliary_loss_mlp": 0.01105998, + "balance_loss_clip": 1.00199163, + "balance_loss_mlp": 1.00061738, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.5735764450403045, + "language_loss": 0.7930519, + "learning_rate": 8.603473882200444e-07, + "loss": 0.8157692, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.493726968765259 + }, + { + "auxiliary_loss_clip": 0.01136061, + "auxiliary_loss_mlp": 0.01105975, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.0005939, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.2811525295507007, + "language_loss": 0.70796192, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73038232, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 3.957836627960205 + }, + { + "auxiliary_loss_clip": 0.01119769, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_clip": 1.00191915, + "balance_loss_mlp": 1.00057781, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.6353161618762115, + "language_loss": 0.7495926, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77186036, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 2.6007707118988037 + }, + { + "auxiliary_loss_clip": 0.01135996, + "auxiliary_loss_mlp": 0.01105504, + "balance_loss_clip": 1.00169802, + "balance_loss_mlp": 1.0005995, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.429635881458535, + "language_loss": 0.76605582, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78847086, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.603663206100464 + }, + { + "auxiliary_loss_clip": 0.01115667, + "auxiliary_loss_mlp": 0.00747322, + "balance_loss_clip": 1.00177038, + "balance_loss_mlp": 1.00038815, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.160032826126814, + "language_loss": 0.73574835, + "learning_rate": 8.590675499086841e-07, + "loss": 0.7543782, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.576258420944214 + }, + { + "auxiliary_loss_clip": 0.01117363, + "auxiliary_loss_mlp": 0.01106605, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00065148, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.6379949547110264, + "language_loss": 0.71682191, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73906159, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 2.6682510375976562 + }, + { + "auxiliary_loss_clip": 0.01150938, + "auxiliary_loss_mlp": 0.01106268, + "balance_loss_clip": 1.001863, + "balance_loss_mlp": 1.00060129, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8424171854156504, + "language_loss": 0.71628892, + "learning_rate": 8.584278902901128e-07, + "loss": 0.73886096, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.5521509647369385 + }, + { + "auxiliary_loss_clip": 0.01151027, + "auxiliary_loss_mlp": 0.01105948, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00056672, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 2.950561688680614, + "language_loss": 0.8445977, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86716747, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 2.5267107486724854 + }, + { + "auxiliary_loss_clip": 0.01145458, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_clip": 1.00169885, + "balance_loss_mlp": 1.00030565, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9734257558426564, + "language_loss": 0.7003842, + "learning_rate": 8.577884038256566e-07, + "loss": 0.72267342, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.2767727375030518 + }, + { + "auxiliary_loss_clip": 0.01119296, + "auxiliary_loss_mlp": 0.01105388, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.00057936, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.8123187454378145, + "language_loss": 0.77236736, + "learning_rate": 8.574687255565329e-07, + "loss": 0.7946142, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 2.6252799034118652 + }, + { + "auxiliary_loss_clip": 0.01165747, + "auxiliary_loss_mlp": 0.01105345, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00072694, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.063603338448542, + "language_loss": 0.68484366, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70755458, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.500746011734009 + }, + { + "auxiliary_loss_clip": 0.01132391, + "auxiliary_loss_mlp": 0.01106627, + "balance_loss_clip": 1.00180447, + "balance_loss_mlp": 1.00057888, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 1.9558322240317176, + "language_loss": 0.79488146, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81727171, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.5707714557647705 + }, + { + "auxiliary_loss_clip": 0.01165701, + "auxiliary_loss_mlp": 0.01106364, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00079203, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.5454461662470642, + "language_loss": 0.75773114, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78045177, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.5023787021636963 + }, + { + "auxiliary_loss_clip": 0.01133709, + "auxiliary_loss_mlp": 0.01105332, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00052285, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.7990051334392283, + "language_loss": 0.81695914, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83934951, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.5707104206085205 + }, + { + "auxiliary_loss_clip": 0.01132217, + "auxiliary_loss_mlp": 0.01105669, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00047874, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.481486898839284, + "language_loss": 0.76632011, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78869903, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.5836033821105957 + }, + { + "auxiliary_loss_clip": 0.01134194, + "auxiliary_loss_mlp": 0.01106405, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00064242, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.71440818392818, + "language_loss": 0.6805985, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70300454, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 2.7211592197418213 + }, + { + "auxiliary_loss_clip": 0.01165733, + "auxiliary_loss_mlp": 0.01105726, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00053561, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.059966949671178, + "language_loss": 0.75568712, + "learning_rate": 8.552321914485203e-07, + "loss": 0.77840173, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.4640402793884277 + }, + { + "auxiliary_loss_clip": 0.01133818, + "auxiliary_loss_mlp": 0.01107565, + "balance_loss_clip": 1.00194728, + "balance_loss_mlp": 1.0006578, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 1.8750227467793807, + "language_loss": 0.74387473, + "learning_rate": 8.549128601178852e-07, + "loss": 0.76628864, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 2.5525336265563965 + }, + { + "auxiliary_loss_clip": 0.01136132, + "auxiliary_loss_mlp": 0.0110618, + "balance_loss_clip": 1.0017637, + "balance_loss_mlp": 1.00070381, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.566616588256824, + "language_loss": 0.75510675, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77752984, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.6113829612731934 + }, + { + "auxiliary_loss_clip": 0.01099724, + "auxiliary_loss_mlp": 0.01106336, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00057364, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 2.2411924638097878, + "language_loss": 0.80927038, + "learning_rate": 8.542743277341793e-07, + "loss": 0.83133101, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 2.6177515983581543 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00067663, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 2.3085923301448115, + "language_loss": 0.84794617, + "learning_rate": 8.539551267053222e-07, + "loss": 0.87035245, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 2.5740654468536377 + }, + { + "auxiliary_loss_clip": 0.01151087, + "auxiliary_loss_mlp": 0.0110657, + "balance_loss_clip": 1.00202596, + "balance_loss_mlp": 1.00061691, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 1.8925044174218273, + "language_loss": 0.79089856, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81347513, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.5306789875030518 + }, + { + "auxiliary_loss_clip": 0.01148958, + "auxiliary_loss_mlp": 0.01105983, + "balance_loss_clip": 1.00181115, + "balance_loss_mlp": 1.0005064, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.6859101083667205, + "language_loss": 0.74589646, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76844585, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.642317771911621 + }, + { + "auxiliary_loss_clip": 0.01150307, + "auxiliary_loss_mlp": 0.01106327, + "balance_loss_clip": 1.00201321, + "balance_loss_mlp": 1.00056422, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.208427910812571, + "language_loss": 0.84003705, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86260343, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 3.902219533920288 + }, + { + "auxiliary_loss_clip": 0.01165816, + "auxiliary_loss_mlp": 0.01106032, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00055587, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 1.6277998049319637, + "language_loss": 0.60822761, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63094604, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 2.5028295516967773 + }, + { + "auxiliary_loss_clip": 0.01165626, + "auxiliary_loss_mlp": 0.01106035, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00055838, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.8374308822947238, + "language_loss": 0.60906255, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63177919, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 2.5501644611358643 + }, + { + "auxiliary_loss_clip": 0.01149087, + "auxiliary_loss_mlp": 0.01104883, + "balance_loss_clip": 1.00168633, + "balance_loss_mlp": 1.00055146, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.5049863655490636, + "language_loss": 0.70693469, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72947437, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 2.5887718200683594 + }, + { + "auxiliary_loss_clip": 0.01148982, + "auxiliary_loss_mlp": 0.01105692, + "balance_loss_clip": 1.0018965, + "balance_loss_mlp": 1.00059748, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.642525327032146, + "language_loss": 0.61582118, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63836795, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.5461087226867676 + }, + { + "auxiliary_loss_clip": 0.0115009, + "auxiliary_loss_mlp": 0.01106279, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00061154, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.111734200792416, + "language_loss": 0.68753445, + "learning_rate": 8.514030839837756e-07, + "loss": 0.71009815, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.5328779220581055 + }, + { + "auxiliary_loss_clip": 0.01165635, + "auxiliary_loss_mlp": 0.01106059, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00058246, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.756344620683358, + "language_loss": 0.7640686, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78678554, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 2.5275864601135254 + }, + { + "auxiliary_loss_clip": 0.01136188, + "auxiliary_loss_mlp": 0.01105305, + "balance_loss_clip": 1.00191391, + "balance_loss_mlp": 1.00059152, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.5924003257440564, + "language_loss": 0.72133106, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74374604, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 2.543570041656494 + }, + { + "auxiliary_loss_clip": 0.01148867, + "auxiliary_loss_mlp": 0.01105647, + "balance_loss_clip": 1.00178683, + "balance_loss_mlp": 1.00064719, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.0294553601037397, + "language_loss": 0.79041529, + "learning_rate": 8.504467862866267e-07, + "loss": 0.81296039, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.512317419052124 + }, + { + "auxiliary_loss_clip": 0.01150543, + "auxiliary_loss_mlp": 0.01106388, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00062561, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 2.394054818691773, + "language_loss": 0.77181524, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79438448, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 3.9184396266937256 + }, + { + "auxiliary_loss_clip": 0.01117557, + "auxiliary_loss_mlp": 0.01105616, + "balance_loss_clip": 1.00185454, + "balance_loss_mlp": 1.00052142, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.1891070722897177, + "language_loss": 0.73778415, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76001585, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 4.006348371505737 + }, + { + "auxiliary_loss_clip": 0.01109284, + "auxiliary_loss_mlp": 0.0108346, + "balance_loss_clip": 1.00111258, + "balance_loss_mlp": 1.00029933, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8802788185699018, + "language_loss": 0.64604336, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66797084, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 3.231086015701294 + }, + { + "auxiliary_loss_clip": 0.01150844, + "auxiliary_loss_mlp": 0.01105858, + "balance_loss_clip": 1.00181282, + "balance_loss_mlp": 1.00066769, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 1.8710518067149435, + "language_loss": 0.72811532, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75068235, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 2.583221197128296 + }, + { + "auxiliary_loss_clip": 0.01132216, + "auxiliary_loss_mlp": 0.0074765, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00062752, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.625218417479387, + "language_loss": 0.79675519, + "learning_rate": 8.488538287759248e-07, + "loss": 0.8155539, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.6051437854766846 + }, + { + "auxiliary_loss_clip": 0.01134616, + "auxiliary_loss_mlp": 0.01106813, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.000669, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.947414568187915, + "language_loss": 0.71395075, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73636508, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.5451505184173584 + }, + { + "auxiliary_loss_clip": 0.01103524, + "auxiliary_loss_mlp": 0.01106128, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00055599, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 2.465522000792402, + "language_loss": 0.66632539, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68842196, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 2.749528646469116 + }, + { + "auxiliary_loss_clip": 0.01165739, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.00060272, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 2.6152871381070333, + "language_loss": 0.74096316, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76367849, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 3.936218500137329 + }, + { + "auxiliary_loss_clip": 0.01150932, + "auxiliary_loss_mlp": 0.01106053, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00067198, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 2.1825240935599974, + "language_loss": 0.79795498, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82052487, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.5703704357147217 + }, + { + "auxiliary_loss_clip": 0.01148907, + "auxiliary_loss_mlp": 0.01105954, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00076413, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 3.4864840694417096, + "language_loss": 0.65589011, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67843878, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.7020657062530518 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.0110627, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.1569915856677255, + "language_loss": 0.80030519, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82270777, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 2.540703296661377 + }, + { + "auxiliary_loss_clip": 0.01112959, + "auxiliary_loss_mlp": 0.01083193, + "balance_loss_clip": 1.0012356, + "balance_loss_mlp": 1.00003242, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7666689174642703, + "language_loss": 0.64767742, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66963899, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.3046412467956543 + }, + { + "auxiliary_loss_clip": 0.01136295, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00056744, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.8571140345860602, + "language_loss": 0.65731454, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67974073, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.5909030437469482 + }, + { + "auxiliary_loss_clip": 0.01119185, + "auxiliary_loss_mlp": 0.01105271, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00065255, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.731494000980905, + "language_loss": 0.80879319, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83103776, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 2.615837812423706 + }, + { + "auxiliary_loss_clip": 0.01149084, + "auxiliary_loss_mlp": 0.01107197, + "balance_loss_clip": 1.00187552, + "balance_loss_mlp": 1.00057662, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 1.6127526349071428, + "language_loss": 0.73083997, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75340283, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 2.525251865386963 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.01106479, + "balance_loss_clip": 1.00169992, + "balance_loss_mlp": 1.00052547, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 1.9604000978093301, + "language_loss": 0.78331548, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80542791, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.624260902404785 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01105479, + "balance_loss_clip": 1.00170779, + "balance_loss_mlp": 1.00057483, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.7551050883469237, + "language_loss": 0.70646203, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72885829, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.615525722503662 + }, + { + "auxiliary_loss_clip": 0.01165397, + "auxiliary_loss_mlp": 0.00747499, + "balance_loss_clip": 1.00181341, + "balance_loss_mlp": 1.00053, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.565827579662724, + "language_loss": 0.69070256, + "learning_rate": 8.44717250248668e-07, + "loss": 0.70983154, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 2.57021164894104 + }, + { + "auxiliary_loss_clip": 0.01117328, + "auxiliary_loss_mlp": 0.00747432, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00047946, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 2.006605037078388, + "language_loss": 0.73435789, + "learning_rate": 8.443993582217803e-07, + "loss": 0.7530055, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.661058187484741 + }, + { + "auxiliary_loss_clip": 0.0113545, + "auxiliary_loss_mlp": 0.01106289, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00052667, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.5523737350435642, + "language_loss": 0.78277701, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80519438, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.6092708110809326 + }, + { + "auxiliary_loss_clip": 0.01165811, + "auxiliary_loss_mlp": 0.01106974, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00073481, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.211068089585857, + "language_loss": 0.62966543, + "learning_rate": 8.437637056415359e-07, + "loss": 0.65239334, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.503161668777466 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01106645, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00050151, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 1.8711761266126103, + "language_loss": 0.74197608, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76405931, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.6214075088500977 + }, + { + "auxiliary_loss_clip": 0.01151152, + "auxiliary_loss_mlp": 0.01105109, + "balance_loss_clip": 1.00207889, + "balance_loss_mlp": 1.00049067, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.3968173157267134, + "language_loss": 0.71254426, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73510689, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.5525989532470703 + }, + { + "auxiliary_loss_clip": 0.01116855, + "auxiliary_loss_mlp": 0.01105485, + "balance_loss_clip": 1.00172472, + "balance_loss_mlp": 1.00067568, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.9336818089408885, + "language_loss": 0.73518348, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75740683, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.5992488861083984 + }, + { + "auxiliary_loss_clip": 0.01117623, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00073242, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.4968583151719153, + "language_loss": 0.69181406, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71406388, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.597470998764038 + }, + { + "auxiliary_loss_clip": 0.01134058, + "auxiliary_loss_mlp": 0.01107054, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00081503, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.0383738063434627, + "language_loss": 0.72558486, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74799597, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.5696945190429688 + }, + { + "auxiliary_loss_clip": 0.01135675, + "auxiliary_loss_mlp": 0.01105421, + "balance_loss_clip": 1.00195634, + "balance_loss_mlp": 1.00051713, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.7355147458167406, + "language_loss": 0.69399619, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71640718, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 2.5902535915374756 + }, + { + "auxiliary_loss_clip": 0.01115608, + "auxiliary_loss_mlp": 0.01106939, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00079477, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 1.8934350382169363, + "language_loss": 0.67256302, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69478846, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 2.5797595977783203 + }, + { + "auxiliary_loss_clip": 0.01165868, + "auxiliary_loss_mlp": 0.01107064, + "balance_loss_clip": 1.00206411, + "balance_loss_mlp": 1.0006336, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.6429507566294608, + "language_loss": 0.75220329, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77493262, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.748534917831421 + }, + { + "auxiliary_loss_clip": 0.01133714, + "auxiliary_loss_mlp": 0.00747586, + "balance_loss_clip": 1.00184059, + "balance_loss_mlp": 1.00050998, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.5687584965433194, + "language_loss": 0.7128762, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73168921, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 4.045434474945068 + }, + { + "auxiliary_loss_clip": 0.01118535, + "auxiliary_loss_mlp": 0.01104889, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00055707, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6586575971109208, + "language_loss": 0.82129145, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84352565, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 2.6167232990264893 + }, + { + "auxiliary_loss_clip": 0.01132206, + "auxiliary_loss_mlp": 0.01105937, + "balance_loss_clip": 1.00172639, + "balance_loss_mlp": 1.0006516, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 1.7607419052955655, + "language_loss": 0.77671051, + "learning_rate": 8.402707539225993e-07, + "loss": 0.79909194, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 2.560802698135376 + }, + { + "auxiliary_loss_clip": 0.01165827, + "auxiliary_loss_mlp": 0.01106095, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00061822, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.5752130973916063, + "language_loss": 0.64031327, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66303253, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 2.5688581466674805 + }, + { + "auxiliary_loss_clip": 0.01136077, + "auxiliary_loss_mlp": 0.01106304, + "balance_loss_clip": 1.00195539, + "balance_loss_mlp": 1.00054121, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 1.9313355661539942, + "language_loss": 0.65958458, + "learning_rate": 8.396362430240902e-07, + "loss": 0.68200839, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.539860486984253 + }, + { + "auxiliary_loss_clip": 0.01150816, + "auxiliary_loss_mlp": 0.01106167, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.0006907, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.9850026852001939, + "language_loss": 0.63803208, + "learning_rate": 8.393190535704857e-07, + "loss": 0.66060197, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 2.5347516536712646 + }, + { + "auxiliary_loss_clip": 0.0112137, + "auxiliary_loss_mlp": 0.01106396, + "balance_loss_clip": 1.00176752, + "balance_loss_mlp": 1.00053775, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.6880975289781786, + "language_loss": 0.71857452, + "learning_rate": 8.390019081300188e-07, + "loss": 0.74085224, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.662233591079712 + }, + { + "auxiliary_loss_clip": 0.01084146, + "auxiliary_loss_mlp": 0.011061, + "balance_loss_clip": 1.00171936, + "balance_loss_mlp": 1.0006237, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.4029930832599609, + "language_loss": 0.79264605, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81454849, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 2.754611015319824 + }, + { + "auxiliary_loss_clip": 0.01151022, + "auxiliary_loss_mlp": 0.01105265, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00064731, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 2.4000148407742685, + "language_loss": 0.64741737, + "learning_rate": 8.383677493366031e-07, + "loss": 0.66998023, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01100307, + "auxiliary_loss_mlp": 0.01106842, + "balance_loss_clip": 1.00160313, + "balance_loss_mlp": 1.00060308, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.855094930156674, + "language_loss": 0.79614162, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81821311, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 4.069141626358032 + }, + { + "auxiliary_loss_clip": 0.01161237, + "auxiliary_loss_mlp": 0.01083235, + "balance_loss_clip": 1.0012635, + "balance_loss_mlp": 1.00007439, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7890751714316061, + "language_loss": 0.54041708, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56286174, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 4.415971755981445 + }, + { + "auxiliary_loss_clip": 0.01133658, + "auxiliary_loss_mlp": 0.01105915, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00062931, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.6798208632438631, + "language_loss": 0.78931302, + "learning_rate": 8.37416841545612e-07, + "loss": 0.81170875, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.6101086139678955 + }, + { + "auxiliary_loss_clip": 0.01114962, + "auxiliary_loss_mlp": 0.01104644, + "balance_loss_clip": 1.00174463, + "balance_loss_mlp": 1.00050306, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 2.9168868083286092, + "language_loss": 0.67778873, + "learning_rate": 8.370999604364634e-07, + "loss": 0.69998473, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.6089720726013184 + }, + { + "auxiliary_loss_clip": 0.01087972, + "auxiliary_loss_mlp": 0.0074758, + "balance_loss_clip": 1.00171638, + "balance_loss_mlp": 1.0005672, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.0790496876778364, + "language_loss": 0.76573277, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78408831, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 2.7228312492370605 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.0074735, + "balance_loss_clip": 1.00164151, + "balance_loss_mlp": 1.0005883, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.5642569596585914, + "language_loss": 0.70939171, + "learning_rate": 8.364663305220405e-07, + "loss": 0.72804761, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 2.648165225982666 + }, + { + "auxiliary_loss_clip": 0.01118277, + "auxiliary_loss_mlp": 0.01105936, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00065053, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.5340312058294372, + "language_loss": 0.89062893, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91287106, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 2.6054351329803467 + }, + { + "auxiliary_loss_clip": 0.01132167, + "auxiliary_loss_mlp": 0.00747541, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00054312, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.6108561501052772, + "language_loss": 0.79607302, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81487012, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 3.9787328243255615 + }, + { + "auxiliary_loss_clip": 0.01100262, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_clip": 1.00131321, + "balance_loss_mlp": 0.99999446, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8364990216748432, + "language_loss": 0.6029886, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62481892, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 2.995868444442749 + }, + { + "auxiliary_loss_clip": 0.01119718, + "auxiliary_loss_mlp": 0.01106495, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00073266, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 2.028198510966695, + "language_loss": 0.80181187, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82407403, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 2.5923330783843994 + }, + { + "auxiliary_loss_clip": 0.01113867, + "auxiliary_loss_mlp": 0.00747546, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00054097, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 1.814210492117054, + "language_loss": 0.7733016, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79191566, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.7819130420684814 + }, + { + "auxiliary_loss_clip": 0.01150867, + "auxiliary_loss_mlp": 0.01105699, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.0005089, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.4642439673632628, + "language_loss": 0.67757595, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70014167, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.5577404499053955 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.01105819, + "balance_loss_clip": 1.00193059, + "balance_loss_mlp": 1.00062871, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.80356768373477, + "language_loss": 0.80520874, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82743466, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.5642409324645996 + }, + { + "auxiliary_loss_clip": 0.01135824, + "auxiliary_loss_mlp": 0.01106211, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00054431, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.466005633319884, + "language_loss": 0.75070608, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77312642, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.5415024757385254 + }, + { + "auxiliary_loss_clip": 0.01134291, + "auxiliary_loss_mlp": 0.01105223, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00060487, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.5629160313293773, + "language_loss": 0.76922047, + "learning_rate": 8.336171812990724e-07, + "loss": 0.79161561, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 2.5840280055999756 + }, + { + "auxiliary_loss_clip": 0.01119217, + "auxiliary_loss_mlp": 0.00747422, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00046718, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.271748445206609, + "language_loss": 0.78466392, + "learning_rate": 8.333008301499453e-07, + "loss": 0.8033303, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.663755178451538 + }, + { + "auxiliary_loss_clip": 0.01102748, + "auxiliary_loss_mlp": 0.01106441, + "balance_loss_clip": 1.00164604, + "balance_loss_mlp": 1.0006783, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.5731996599914162, + "language_loss": 0.7939567, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81604862, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 2.6259377002716064 + }, + { + "auxiliary_loss_clip": 0.01165518, + "auxiliary_loss_mlp": 0.01105066, + "balance_loss_clip": 1.00187266, + "balance_loss_mlp": 1.00054312, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.6277154686577628, + "language_loss": 0.68450487, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70721066, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.557520627975464 + }, + { + "auxiliary_loss_clip": 0.01134364, + "auxiliary_loss_mlp": 0.01106497, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00063896, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.918054184191222, + "language_loss": 0.63988036, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66228896, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.5979225635528564 + }, + { + "auxiliary_loss_clip": 0.0114901, + "auxiliary_loss_mlp": 0.01106094, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00042677, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.3957036845735344, + "language_loss": 0.52712035, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54967135, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 2.5933711528778076 + }, + { + "auxiliary_loss_clip": 0.01133252, + "auxiliary_loss_mlp": 0.00747567, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00053596, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.5537628148890945, + "language_loss": 0.75961381, + "learning_rate": 8.317197382644119e-07, + "loss": 0.778422, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.580538272857666 + }, + { + "auxiliary_loss_clip": 0.01132136, + "auxiliary_loss_mlp": 0.01083317, + "balance_loss_clip": 1.00125778, + "balance_loss_mlp": 1.00015664, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8381326712907725, + "language_loss": 0.61973393, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64188838, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.090965986251831 + }, + { + "auxiliary_loss_clip": 0.01117731, + "auxiliary_loss_mlp": 0.01106435, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00067222, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.7320565383058661, + "language_loss": 0.76359475, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78583634, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.6427879333496094 + }, + { + "auxiliary_loss_clip": 0.01148789, + "auxiliary_loss_mlp": 0.0110454, + "balance_loss_clip": 1.00182533, + "balance_loss_mlp": 1.00049448, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.4811428455761764, + "language_loss": 0.71118927, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73372257, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.5348525047302246 + }, + { + "auxiliary_loss_clip": 0.01099433, + "auxiliary_loss_mlp": 0.01106258, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00059128, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 1.8991861858794512, + "language_loss": 0.69550586, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71756279, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.653223991394043 + }, + { + "auxiliary_loss_clip": 0.01149971, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00068951, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.5431881398195264, + "language_loss": 0.70318919, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72575057, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.5432372093200684 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01104813, + "balance_loss_clip": 1.00201094, + "balance_loss_mlp": 1.0006721, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.455741890082106, + "language_loss": 0.74622345, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76857209, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.5847485065460205 + }, + { + "auxiliary_loss_clip": 0.01115121, + "auxiliary_loss_mlp": 0.00747511, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00060582, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 2.066594943779157, + "language_loss": 0.86776215, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88638848, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 2.601527690887451 + }, + { + "auxiliary_loss_clip": 0.01151114, + "auxiliary_loss_mlp": 0.01105159, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00073159, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.5425818362571906, + "language_loss": 0.75048566, + "learning_rate": 8.291922955383641e-07, + "loss": 0.7730484, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 2.5255379676818848 + }, + { + "auxiliary_loss_clip": 0.01134431, + "auxiliary_loss_mlp": 0.01107319, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00060344, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 3.1884940206937893, + "language_loss": 0.81925988, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84167743, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 3.9995718002319336 + }, + { + "auxiliary_loss_clip": 0.01133992, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00057554, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.4580107659770403, + "language_loss": 0.84606922, + "learning_rate": 8.285608785887673e-07, + "loss": 0.8684516, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.578160285949707 + }, + { + "auxiliary_loss_clip": 0.01134083, + "auxiliary_loss_mlp": 0.01105983, + "balance_loss_clip": 1.00174177, + "balance_loss_mlp": 1.00060236, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 3.009228749571619, + "language_loss": 0.7155301, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73793077, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 2.710475444793701 + }, + { + "auxiliary_loss_clip": 0.01100348, + "auxiliary_loss_mlp": 0.01105817, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00053179, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4948681582923065, + "language_loss": 0.72810441, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75016606, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.6436822414398193 + }, + { + "auxiliary_loss_clip": 0.01149008, + "auxiliary_loss_mlp": 0.01105813, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00062299, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 4.547479207375649, + "language_loss": 0.7716651, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79421329, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 2.5044240951538086 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01105064, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00054133, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.5716894766201133, + "language_loss": 0.69502008, + "learning_rate": 8.272985778383828e-07, + "loss": 0.7173872, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 2.637735605239868 + }, + { + "auxiliary_loss_clip": 0.01103849, + "auxiliary_loss_mlp": 0.01105529, + "balance_loss_clip": 1.00167918, + "balance_loss_mlp": 1.00052953, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.576771096175536, + "language_loss": 0.78809577, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81018955, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 2.6418683528900146 + }, + { + "auxiliary_loss_clip": 0.01165716, + "auxiliary_loss_mlp": 0.01105591, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00059175, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 1.6503827500831119, + "language_loss": 0.77300394, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79571694, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.508737564086914 + }, + { + "auxiliary_loss_clip": 0.01134242, + "auxiliary_loss_mlp": 0.01105769, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00067425, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.5847277021625772, + "language_loss": 0.77706122, + "learning_rate": 8.26352319157738e-07, + "loss": 0.79946131, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 2.6051547527313232 + }, + { + "auxiliary_loss_clip": 0.01165805, + "auxiliary_loss_mlp": 0.01105866, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00048518, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 1.9184337591270002, + "language_loss": 0.79201251, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81472921, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 3.9526171684265137 + }, + { + "auxiliary_loss_clip": 0.01148905, + "auxiliary_loss_mlp": 0.01105376, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00056744, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 2.1726878120191233, + "language_loss": 0.76457465, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78711748, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 3.910745859146118 + }, + { + "auxiliary_loss_clip": 0.01119452, + "auxiliary_loss_mlp": 0.01107645, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00054705, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 1.8480019672862842, + "language_loss": 0.67861176, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70088267, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 2.5795559883117676 + }, + { + "auxiliary_loss_clip": 0.01082392, + "auxiliary_loss_mlp": 0.0110583, + "balance_loss_clip": 1.00157118, + "balance_loss_mlp": 1.00063968, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.5333012820996492, + "language_loss": 0.77150226, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79338449, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 2.7031660079956055 + }, + { + "auxiliary_loss_clip": 0.01133857, + "auxiliary_loss_mlp": 0.01106492, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00044322, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 2.077301685494245, + "language_loss": 0.71208525, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73448873, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.6339516639709473 + }, + { + "auxiliary_loss_clip": 0.01148942, + "auxiliary_loss_mlp": 0.01105558, + "balance_loss_clip": 1.00191462, + "balance_loss_mlp": 1.00055802, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 1.540982713521256, + "language_loss": 0.82215178, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84469676, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 2.5309548377990723 + }, + { + "auxiliary_loss_clip": 0.01114246, + "auxiliary_loss_mlp": 0.01106531, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00057817, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 2.5913052021936425, + "language_loss": 0.64567119, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66787899, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 2.6272451877593994 + }, + { + "auxiliary_loss_clip": 0.01150653, + "auxiliary_loss_mlp": 0.01105091, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00056791, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.700563296113053, + "language_loss": 0.7076748, + "learning_rate": 8.238309217655133e-07, + "loss": 0.73023218, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 3.9114573001861572 + }, + { + "auxiliary_loss_clip": 0.01134401, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00057578, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 5.184311257753803, + "language_loss": 0.76100457, + "learning_rate": 8.23515947668052e-07, + "loss": 0.78340149, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.5505335330963135 + }, + { + "auxiliary_loss_clip": 0.01115994, + "auxiliary_loss_mlp": 0.01106029, + "balance_loss_clip": 1.00170732, + "balance_loss_mlp": 1.00064754, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.12099572590466, + "language_loss": 0.75422633, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77644658, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.5745747089385986 + }, + { + "auxiliary_loss_clip": 0.01150963, + "auxiliary_loss_mlp": 0.01107092, + "balance_loss_clip": 1.00197887, + "balance_loss_mlp": 1.00056648, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.5681632441712707, + "language_loss": 0.74235523, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76493579, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 2.539152145385742 + }, + { + "auxiliary_loss_clip": 0.01100774, + "auxiliary_loss_mlp": 0.01105834, + "balance_loss_clip": 1.00177622, + "balance_loss_mlp": 1.00054872, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.5707003885733806, + "language_loss": 0.7927165, + "learning_rate": 8.225712930977953e-07, + "loss": 0.8147825, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.667269468307495 + }, + { + "auxiliary_loss_clip": 0.0113633, + "auxiliary_loss_mlp": 0.01105685, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00068521, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.6989474125614972, + "language_loss": 0.66691703, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68933713, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.5776782035827637 + }, + { + "auxiliary_loss_clip": 0.01165736, + "auxiliary_loss_mlp": 0.01105013, + "balance_loss_clip": 1.00202072, + "balance_loss_mlp": 1.00049055, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.9388208304341568, + "language_loss": 0.81728828, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83999574, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.5349957942962646 + }, + { + "auxiliary_loss_clip": 0.01132206, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_clip": 1.00178611, + "balance_loss_mlp": 1.00048614, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.8926880789455933, + "language_loss": 0.86378229, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88615066, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.5179693698883057 + }, + { + "auxiliary_loss_clip": 0.01165673, + "auxiliary_loss_mlp": 0.01105667, + "balance_loss_clip": 1.00192153, + "balance_loss_mlp": 1.00066757, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 2.0011611269472906, + "language_loss": 0.76176989, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78448331, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 2.4819839000701904 + }, + { + "auxiliary_loss_clip": 0.01151151, + "auxiliary_loss_mlp": 0.0110564, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00092602, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 1.6126822841507855, + "language_loss": 0.81691754, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83948541, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.538114547729492 + }, + { + "auxiliary_loss_clip": 0.0116567, + "auxiliary_loss_mlp": 0.01105915, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00053358, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.295484540938617, + "language_loss": 0.67537296, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69808882, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 2.4870455265045166 + }, + { + "auxiliary_loss_clip": 0.01150782, + "auxiliary_loss_mlp": 0.01104535, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00058484, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.7228569327401622, + "language_loss": 0.77984828, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80240142, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.581977128982544 + }, + { + "auxiliary_loss_clip": 0.01133629, + "auxiliary_loss_mlp": 0.00747584, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00061858, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 1.9332232142067367, + "language_loss": 0.78887773, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80768979, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 2.5645391941070557 + }, + { + "auxiliary_loss_clip": 0.01133901, + "auxiliary_loss_mlp": 0.01106184, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00061262, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.0049212468087205, + "language_loss": 0.56008881, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58248961, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.5725011825561523 + }, + { + "auxiliary_loss_clip": 0.01165732, + "auxiliary_loss_mlp": 0.01106587, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00063348, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 4.454938112211434, + "language_loss": 0.68863612, + "learning_rate": 8.194253484740882e-07, + "loss": 0.71135938, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 2.501190662384033 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01105702, + "balance_loss_clip": 1.0017252, + "balance_loss_mlp": 1.00041604, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.897229470225806, + "language_loss": 0.71431279, + "learning_rate": 8.191110000362513e-07, + "loss": 0.7368626, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.5344715118408203 + }, + { + "auxiliary_loss_clip": 0.0116128, + "auxiliary_loss_mlp": 0.01082778, + "balance_loss_clip": 1.00129271, + "balance_loss_mlp": 0.99999923, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.745347723927517, + "language_loss": 0.59418035, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61662102, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.1935744285583496 + }, + { + "auxiliary_loss_clip": 0.01070764, + "auxiliary_loss_mlp": 0.01105456, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00064707, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 2.576918471857538, + "language_loss": 0.74147093, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76323318, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.738497734069824 + }, + { + "auxiliary_loss_clip": 0.01115735, + "auxiliary_loss_mlp": 0.01104303, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.00054359, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 4.449782900282667, + "language_loss": 0.83566141, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85786176, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.6345889568328857 + }, + { + "auxiliary_loss_clip": 0.01165724, + "auxiliary_loss_mlp": 0.01105981, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00050485, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 2.0007825957648455, + "language_loss": 0.70270443, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72542149, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.543363094329834 + }, + { + "auxiliary_loss_clip": 0.01165523, + "auxiliary_loss_mlp": 0.01105057, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00053406, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.705890266712151, + "language_loss": 0.81715757, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83986336, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.5094361305236816 + }, + { + "auxiliary_loss_clip": 0.01165666, + "auxiliary_loss_mlp": 0.0110551, + "balance_loss_clip": 1.00197279, + "balance_loss_mlp": 1.00051057, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 1.800940521270591, + "language_loss": 0.75605524, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77876699, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 2.4835143089294434 + }, + { + "auxiliary_loss_clip": 0.01101242, + "auxiliary_loss_mlp": 0.01104797, + "balance_loss_clip": 1.00165224, + "balance_loss_mlp": 1.00056064, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.7039706191461728, + "language_loss": 0.78505111, + "learning_rate": 8.16911815462725e-07, + "loss": 0.8071115, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 4.002630233764648 + }, + { + "auxiliary_loss_clip": 0.01134286, + "auxiliary_loss_mlp": 0.01104517, + "balance_loss_clip": 1.00163722, + "balance_loss_mlp": 1.00066137, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.7403406119291334, + "language_loss": 0.86603743, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88842547, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 2.5220940113067627 + }, + { + "auxiliary_loss_clip": 0.0110085, + "auxiliary_loss_mlp": 0.01104404, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00054848, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 2.128024580238703, + "language_loss": 0.84807116, + "learning_rate": 8.162838805998897e-07, + "loss": 0.87012368, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 2.621009588241577 + }, + { + "auxiliary_loss_clip": 0.01165532, + "auxiliary_loss_mlp": 0.01105604, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00060463, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.0911784071888815, + "language_loss": 0.75161028, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77432168, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.468566656112671 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01105702, + "balance_loss_clip": 1.00165951, + "balance_loss_mlp": 1.00041664, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.546093512683072, + "language_loss": 0.70821285, + "learning_rate": 8.156561252835883e-07, + "loss": 0.7302922, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 2.6738603115081787 + }, + { + "auxiliary_loss_clip": 0.0114896, + "auxiliary_loss_mlp": 0.01105071, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.0005486, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.8376354797287546, + "language_loss": 0.7548511, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77739143, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 2.534518003463745 + }, + { + "auxiliary_loss_clip": 0.01095257, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_clip": 1.00129747, + "balance_loss_mlp": 1.00005424, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7650644344181305, + "language_loss": 0.55022609, + "learning_rate": 8.150285496090388e-07, + "loss": 0.572007, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.265188455581665 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01105233, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00070989, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 2.140902368713618, + "language_loss": 0.59950733, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62204683, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 2.5958337783813477 + }, + { + "auxiliary_loss_clip": 0.01148925, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00053072, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 1.9667429335424778, + "language_loss": 0.71416819, + "learning_rate": 8.144011536714322e-07, + "loss": 0.736709, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 2.5263938903808594 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.00747396, + "balance_loss_clip": 1.00183213, + "balance_loss_mlp": 1.00049472, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.8017229214577, + "language_loss": 0.72736824, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74619901, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 4.209780931472778 + }, + { + "auxiliary_loss_clip": 0.01130493, + "auxiliary_loss_mlp": 0.0110566, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00056505, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.8898490841169044, + "language_loss": 0.79342419, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81578565, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 4.0718443393707275 + }, + { + "auxiliary_loss_clip": 0.01148864, + "auxiliary_loss_mlp": 0.01105586, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.0006814, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.507450911423091, + "language_loss": 0.83138007, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85392457, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 2.6773691177368164 + }, + { + "auxiliary_loss_clip": 0.01119041, + "auxiliary_loss_mlp": 0.01105917, + "balance_loss_clip": 1.00174105, + "balance_loss_mlp": 1.00063145, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.3142079279700025, + "language_loss": 0.62297475, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64522433, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.730454444885254 + }, + { + "auxiliary_loss_clip": 0.01165615, + "auxiliary_loss_mlp": 0.01105195, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.00057709, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.5140679553503225, + "language_loss": 0.71984327, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74255139, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 2.5548601150512695 + }, + { + "auxiliary_loss_clip": 0.01165645, + "auxiliary_loss_mlp": 0.0110506, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00063312, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.6685850908727846, + "language_loss": 0.80463743, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82734454, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.541717052459717 + }, + { + "auxiliary_loss_clip": 0.01150946, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00070143, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.621363595497736, + "language_loss": 0.83854473, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86110741, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 2.5408103466033936 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.011051, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00048161, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 1.8304923277946306, + "language_loss": 0.77551293, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79792464, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 4.0538177490234375 + }, + { + "auxiliary_loss_clip": 0.01144507, + "auxiliary_loss_mlp": 0.01082435, + "balance_loss_clip": 1.00121725, + "balance_loss_mlp": 1.00003803, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7493537563433765, + "language_loss": 0.56657588, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58884525, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 3.044238328933716 + }, + { + "auxiliary_loss_clip": 0.01101124, + "auxiliary_loss_mlp": 0.01104689, + "balance_loss_clip": 1.0015595, + "balance_loss_mlp": 1.00064301, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.6991242729228595, + "language_loss": 0.70887959, + "learning_rate": 8.11266873367315e-07, + "loss": 0.73093772, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.687077522277832 + }, + { + "auxiliary_loss_clip": 0.0116577, + "auxiliary_loss_mlp": 0.01105922, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.0004456, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 2.6937502643806805, + "language_loss": 0.79790908, + "learning_rate": 8.10953693063704e-07, + "loss": 0.82062596, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01148904, + "auxiliary_loss_mlp": 0.01104931, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00050402, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.6394534265432725, + "language_loss": 0.76053751, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78307581, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 2.59018611907959 + }, + { + "auxiliary_loss_clip": 0.01071592, + "auxiliary_loss_mlp": 0.0110491, + "balance_loss_clip": 1.00149417, + "balance_loss_mlp": 1.00057805, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.6972784936483036, + "language_loss": 0.70438659, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72615159, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 2.757642984390259 + }, + { + "auxiliary_loss_clip": 0.0114908, + "auxiliary_loss_mlp": 0.01105845, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.0006547, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.7340655043258946, + "language_loss": 0.61914331, + "learning_rate": 8.100144227328958e-07, + "loss": 0.64169258, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.5769052505493164 + }, + { + "auxiliary_loss_clip": 0.01149139, + "auxiliary_loss_mlp": 0.01105529, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00052953, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.122728535332375, + "language_loss": 0.67931747, + "learning_rate": 8.097014228555426e-07, + "loss": 0.70186412, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.5879783630371094 + }, + { + "auxiliary_loss_clip": 0.01165629, + "auxiliary_loss_mlp": 0.0110629, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00081396, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 2.173688035261275, + "language_loss": 0.84206557, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86478478, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.502535820007324 + }, + { + "auxiliary_loss_clip": 0.01134161, + "auxiliary_loss_mlp": 0.01105833, + "balance_loss_clip": 1.00190508, + "balance_loss_mlp": 1.00064301, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 1.6358178635006466, + "language_loss": 0.76717031, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78957027, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 2.5556252002716064 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00069702, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 1.8890630193550668, + "language_loss": 0.75264186, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77505553, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.577467918395996 + }, + { + "auxiliary_loss_clip": 0.01145502, + "auxiliary_loss_mlp": 0.01083764, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00022173, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.787179808009673, + "language_loss": 0.61591524, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63820785, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.09653639793396 + }, + { + "auxiliary_loss_clip": 0.01165698, + "auxiliary_loss_mlp": 0.01104695, + "balance_loss_clip": 1.00195658, + "balance_loss_mlp": 1.00055397, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.513274258132625, + "language_loss": 0.80330813, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82601202, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 2.5411901473999023 + }, + { + "auxiliary_loss_clip": 0.01104572, + "auxiliary_loss_mlp": 0.01104874, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00044692, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.198269948483959, + "language_loss": 0.79172087, + "learning_rate": 8.078243718677873e-07, + "loss": 0.8138153, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.642202377319336 + }, + { + "auxiliary_loss_clip": 0.0115098, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.0004977, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 2.25025491882857, + "language_loss": 0.77617526, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79873621, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.6083779335021973 + }, + { + "auxiliary_loss_clip": 0.01151103, + "auxiliary_loss_mlp": 0.01106171, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00059891, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.862336954309799, + "language_loss": 0.58611411, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60868686, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.536677122116089 + }, + { + "auxiliary_loss_clip": 0.01148318, + "auxiliary_loss_mlp": 0.00747328, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00048256, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.613026208638296, + "language_loss": 0.71325004, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73220652, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 2.572774648666382 + }, + { + "auxiliary_loss_clip": 0.01146762, + "auxiliary_loss_mlp": 0.01082784, + "balance_loss_clip": 1.00124431, + "balance_loss_mlp": 1.0000056, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8466771693418129, + "language_loss": 0.63040805, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65270352, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.121786117553711 + }, + { + "auxiliary_loss_clip": 0.01134373, + "auxiliary_loss_mlp": 0.01105462, + "balance_loss_clip": 1.00168204, + "balance_loss_mlp": 1.00065327, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.570114041187299, + "language_loss": 0.64246708, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66486543, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.01150886, + "auxiliary_loss_mlp": 0.01105348, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00044334, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.5014926678908118, + "language_loss": 0.69869012, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72125238, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.5937180519104004 + }, + { + "auxiliary_loss_clip": 0.01150035, + "auxiliary_loss_mlp": 0.01104534, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00058317, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.5826787848469757, + "language_loss": 0.83274639, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85529208, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.6088085174560547 + }, + { + "auxiliary_loss_clip": 0.01150914, + "auxiliary_loss_mlp": 0.00747637, + "balance_loss_clip": 1.00190866, + "balance_loss_mlp": 1.00062156, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.4906932900005803, + "language_loss": 0.72467232, + "learning_rate": 8.053241692752126e-07, + "loss": 0.74365783, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 2.535722017288208 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01103686, + "balance_loss_clip": 1.00170338, + "balance_loss_mlp": 1.00059366, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 2.1549172947539934, + "language_loss": 0.92400408, + "learning_rate": 8.050118476867635e-07, + "loss": 0.9462105, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 4.084538459777832 + }, + { + "auxiliary_loss_clip": 0.01148964, + "auxiliary_loss_mlp": 0.01104365, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.00060546, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.9470915173898768, + "language_loss": 0.79347074, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81600404, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.540891170501709 + }, + { + "auxiliary_loss_clip": 0.01103243, + "auxiliary_loss_mlp": 0.01106173, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00050581, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.7302271574921948, + "language_loss": 0.72781676, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74991095, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 2.6509690284729004 + }, + { + "auxiliary_loss_clip": 0.01149088, + "auxiliary_loss_mlp": 0.01105212, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00068903, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 2.144295477627769, + "language_loss": 0.70180434, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72434735, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.557770013809204 + }, + { + "auxiliary_loss_clip": 0.01150689, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00055456, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.000295397201121, + "language_loss": 0.85293508, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87548792, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.512784481048584 + }, + { + "auxiliary_loss_clip": 0.01165818, + "auxiliary_loss_mlp": 0.01105815, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00052977, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.7248812395821962, + "language_loss": 0.80164695, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82436329, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 2.466439962387085 + }, + { + "auxiliary_loss_clip": 0.01132231, + "auxiliary_loss_mlp": 0.01105078, + "balance_loss_clip": 1.00183558, + "balance_loss_mlp": 1.00055528, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.1960208531762853, + "language_loss": 0.69032532, + "learning_rate": 8.031388701659456e-07, + "loss": 0.7126984, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 2.8980014324188232 + }, + { + "auxiliary_loss_clip": 0.01148749, + "auxiliary_loss_mlp": 0.0110572, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00062537, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.8143513881495157, + "language_loss": 0.64472282, + "learning_rate": 8.028268660246023e-07, + "loss": 0.6672675, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.5179460048675537 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01106509, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00055611, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.702153740813634, + "language_loss": 0.67102242, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69342506, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 2.6055259704589844 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01104703, + "balance_loss_clip": 1.0019325, + "balance_loss_mlp": 1.00065744, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 2.0678764450302443, + "language_loss": 0.6672287, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68961728, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 5.470921993255615 + }, + { + "auxiliary_loss_clip": 0.01101086, + "auxiliary_loss_mlp": 0.01106192, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00052488, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 1.8773079013351337, + "language_loss": 0.65369427, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67576706, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.7147886753082275 + }, + { + "auxiliary_loss_clip": 0.01151007, + "auxiliary_loss_mlp": 0.01106148, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00067151, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.9613713426596635, + "language_loss": 0.8608855, + "learning_rate": 8.015793035467697e-07, + "loss": 0.88345701, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 2.5270512104034424 + }, + { + "auxiliary_loss_clip": 0.01119813, + "auxiliary_loss_mlp": 0.01104989, + "balance_loss_clip": 1.0018698, + "balance_loss_mlp": 1.00056124, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 2.541575195369056, + "language_loss": 0.75232232, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77457041, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 2.606520891189575 + }, + { + "auxiliary_loss_clip": 0.01116051, + "auxiliary_loss_mlp": 0.01106312, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00045431, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 2.235285210378394, + "language_loss": 0.69700855, + "learning_rate": 8.009557949259464e-07, + "loss": 0.7192322, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 2.654726266860962 + }, + { + "auxiliary_loss_clip": 0.01148783, + "auxiliary_loss_mlp": 0.01104793, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00055611, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 2.0134705511244126, + "language_loss": 0.7166664, + "learning_rate": 8.006441088114397e-07, + "loss": 0.7392022, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.5507469177246094 + }, + { + "auxiliary_loss_clip": 0.01100214, + "auxiliary_loss_mlp": 0.01106207, + "balance_loss_clip": 1.00187767, + "balance_loss_mlp": 1.00034916, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.0508289015211743, + "language_loss": 0.65684307, + "learning_rate": 8.003324681766286e-07, + "loss": 0.67890728, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 4.081485748291016 + }, + { + "auxiliary_loss_clip": 0.01134029, + "auxiliary_loss_mlp": 0.01105197, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.0003885, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.6715234060975936, + "language_loss": 0.77819771, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80058998, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.619762897491455 + }, + { + "auxiliary_loss_clip": 0.01102505, + "auxiliary_loss_mlp": 0.01105602, + "balance_loss_clip": 1.0019604, + "balance_loss_mlp": 1.000507, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.7069533408628483, + "language_loss": 0.81008017, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83216125, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 2.6959140300750732 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.0007031, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 4.23912513667406, + "language_loss": 0.79276121, + "learning_rate": 7.993978192685331e-07, + "loss": 0.81499141, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 2.6162428855895996 + }, + { + "auxiliary_loss_clip": 0.01148898, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.0004741, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.342363388539885, + "language_loss": 0.84013969, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86269009, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 2.5374844074249268 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.00174701, + "balance_loss_mlp": 1.00052571, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 2.0393727008703375, + "language_loss": 0.85998029, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88220203, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.6003518104553223 + }, + { + "auxiliary_loss_clip": 0.01150386, + "auxiliary_loss_mlp": 0.01106207, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00053978, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 2.0167063274121446, + "language_loss": 0.82958341, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85214925, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.527195930480957 + }, + { + "auxiliary_loss_clip": 0.01134204, + "auxiliary_loss_mlp": 0.01106645, + "balance_loss_clip": 1.00182569, + "balance_loss_mlp": 1.00059628, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.6761571893828624, + "language_loss": 0.69557774, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71798623, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 2.588942527770996 + }, + { + "auxiliary_loss_clip": 0.0116569, + "auxiliary_loss_mlp": 0.01105467, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00056243, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 7.168445413121215, + "language_loss": 0.77512604, + "learning_rate": 7.978409817849079e-07, + "loss": 0.79783762, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.46691632270813 + }, + { + "auxiliary_loss_clip": 0.01148867, + "auxiliary_loss_mlp": 0.01104993, + "balance_loss_clip": 1.00192297, + "balance_loss_mlp": 1.00056577, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 1.8043199998658024, + "language_loss": 0.69401228, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71655095, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.5395638942718506 + }, + { + "auxiliary_loss_clip": 0.01114853, + "auxiliary_loss_mlp": 0.01104543, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00059235, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 3.9521882608895313, + "language_loss": 0.67580587, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69799978, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.635221004486084 + }, + { + "auxiliary_loss_clip": 0.01098751, + "auxiliary_loss_mlp": 0.01106174, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.941277545341295, + "language_loss": 0.69459164, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71664095, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.6501123905181885 + }, + { + "auxiliary_loss_clip": 0.01134298, + "auxiliary_loss_mlp": 0.01105892, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00060618, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.0813329221255064, + "language_loss": 0.80433893, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82674086, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01115453, + "auxiliary_loss_mlp": 0.01106097, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.0006206, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.9383813931607012, + "language_loss": 0.63568223, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65789771, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.6581404209136963 + }, + { + "auxiliary_loss_clip": 0.01165747, + "auxiliary_loss_mlp": 0.01105482, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00048256, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.6889239324668355, + "language_loss": 0.68498594, + "learning_rate": 7.959742812719304e-07, + "loss": 0.70769823, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.4828546047210693 + }, + { + "auxiliary_loss_clip": 0.01150789, + "auxiliary_loss_mlp": 0.01106392, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00063002, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.8910218839629698, + "language_loss": 0.77830511, + "learning_rate": 7.956633242496788e-07, + "loss": 0.80087692, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 2.532773494720459 + }, + { + "auxiliary_loss_clip": 0.01151032, + "auxiliary_loss_mlp": 0.01105861, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00047946, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.0969710618569994, + "language_loss": 0.73890781, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76147676, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.5525927543640137 + }, + { + "auxiliary_loss_clip": 0.01130026, + "auxiliary_loss_mlp": 0.01082766, + "balance_loss_clip": 1.00135326, + "balance_loss_mlp": 0.99998742, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8940428360101755, + "language_loss": 0.66373086, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68585873, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.1275532245635986 + }, + { + "auxiliary_loss_clip": 0.0109984, + "auxiliary_loss_mlp": 0.01104463, + "balance_loss_clip": 1.00166774, + "balance_loss_mlp": 1.00041699, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.796449091051302, + "language_loss": 0.75196135, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77400446, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.6425516605377197 + }, + { + "auxiliary_loss_clip": 0.01148361, + "auxiliary_loss_mlp": 0.01105336, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00043225, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.5870333536384877, + "language_loss": 0.71754724, + "learning_rate": 7.944199529642372e-07, + "loss": 0.74008423, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.533698320388794 + }, + { + "auxiliary_loss_clip": 0.01151054, + "auxiliary_loss_mlp": 0.01105581, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00067723, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 1.8002507868060227, + "language_loss": 0.84257787, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86514425, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 2.5516350269317627 + }, + { + "auxiliary_loss_clip": 0.01100714, + "auxiliary_loss_mlp": 0.01105681, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00049102, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.736480874748367, + "language_loss": 0.76072788, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78279185, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 2.6587514877319336 + }, + { + "auxiliary_loss_clip": 0.01119122, + "auxiliary_loss_mlp": 0.01105515, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00061059, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.6089965612583417, + "language_loss": 0.7394979, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76174426, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 4.100113391876221 + }, + { + "auxiliary_loss_clip": 0.01102771, + "auxiliary_loss_mlp": 0.0110608, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00060344, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 2.100595849564166, + "language_loss": 0.676898, + "learning_rate": 7.931773131302211e-07, + "loss": 0.69898647, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.6940934658050537 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01106916, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00058126, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.236073529603145, + "language_loss": 0.74000633, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76223588, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 2.6330759525299072 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01106561, + "balance_loss_clip": 1.00211835, + "balance_loss_mlp": 1.00060797, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.3531959417048487, + "language_loss": 0.66235554, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68508077, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 2.4946839809417725 + }, + { + "auxiliary_loss_clip": 0.01117907, + "auxiliary_loss_mlp": 0.01105816, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00053036, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.5372478792168691, + "language_loss": 0.77482891, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79706609, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.6540424823760986 + }, + { + "auxiliary_loss_clip": 0.0114899, + "auxiliary_loss_mlp": 0.01105569, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00056934, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 2.2319969748643, + "language_loss": 0.69383579, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71638143, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 2.524369955062866 + }, + { + "auxiliary_loss_clip": 0.01132124, + "auxiliary_loss_mlp": 0.01105987, + "balance_loss_clip": 1.00164962, + "balance_loss_mlp": 1.00070167, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.0958443274085212, + "language_loss": 0.86237824, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88475931, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.5867178440093994 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01104714, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00057292, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.7945821047614383, + "language_loss": 0.78171468, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80409467, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 2.5729150772094727 + }, + { + "auxiliary_loss_clip": 0.01134334, + "auxiliary_loss_mlp": 0.01106515, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00046635, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.7745000774620447, + "language_loss": 0.72999418, + "learning_rate": 7.910044557431302e-07, + "loss": 0.7524026, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 2.631922721862793 + }, + { + "auxiliary_loss_clip": 0.01150803, + "auxiliary_loss_mlp": 0.01105636, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.0006361, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 4.537383493520221, + "language_loss": 0.76137507, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78393942, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 3.934988498687744 + }, + { + "auxiliary_loss_clip": 0.01149251, + "auxiliary_loss_mlp": 0.01105775, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.00058508, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 2.146177291491717, + "language_loss": 0.80965203, + "learning_rate": 7.903840517773886e-07, + "loss": 0.83220232, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 3.9253573417663574 + }, + { + "auxiliary_loss_clip": 0.01118663, + "auxiliary_loss_mlp": 0.01106268, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00050521, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 1.7369546144818289, + "language_loss": 0.81358898, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83583826, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 2.5852601528167725 + }, + { + "auxiliary_loss_clip": 0.01117244, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00040531, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 1.7275702564619089, + "language_loss": 0.68084353, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70307863, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 2.5905988216400146 + }, + { + "auxiliary_loss_clip": 0.01117775, + "auxiliary_loss_mlp": 0.01104667, + "balance_loss_clip": 1.00161219, + "balance_loss_mlp": 1.00052595, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.9683847242409775, + "language_loss": 0.7583189, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78054333, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 2.6194565296173096 + }, + { + "auxiliary_loss_clip": 0.01132186, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00057447, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.189117726271506, + "language_loss": 0.72276068, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74514109, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 2.5561716556549072 + }, + { + "auxiliary_loss_clip": 0.01115437, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_clip": 1.00173235, + "balance_loss_mlp": 1.00062585, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.6549145095290412, + "language_loss": 0.77850765, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80071354, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.624688148498535 + }, + { + "auxiliary_loss_clip": 0.01128049, + "auxiliary_loss_mlp": 0.0108276, + "balance_loss_clip": 1.0014416, + "balance_loss_mlp": 0.9999814, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7816064247623871, + "language_loss": 0.55284119, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57494926, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 4.476884841918945 + }, + { + "auxiliary_loss_clip": 0.01151108, + "auxiliary_loss_mlp": 0.01105483, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00067401, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.9231048753304967, + "language_loss": 0.69547641, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71804237, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 2.5330746173858643 + }, + { + "auxiliary_loss_clip": 0.01102876, + "auxiliary_loss_mlp": 0.0110608, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00060368, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.6929208577376187, + "language_loss": 0.71304691, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73513645, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.6667962074279785 + }, + { + "auxiliary_loss_clip": 0.01148975, + "auxiliary_loss_mlp": 0.01105825, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00053954, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.465727024224581, + "language_loss": 0.75004852, + "learning_rate": 7.875945057930144e-07, + "loss": 0.7725966, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 2.6236963272094727 + }, + { + "auxiliary_loss_clip": 0.01131932, + "auxiliary_loss_mlp": 0.01105849, + "balance_loss_clip": 1.00167298, + "balance_loss_mlp": 1.00056291, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.932244946574652, + "language_loss": 0.76786232, + "learning_rate": 7.872847859552251e-07, + "loss": 0.79024005, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 2.6379950046539307 + }, + { + "auxiliary_loss_clip": 0.01115813, + "auxiliary_loss_mlp": 0.01106412, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.0004586, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.8918396615141926, + "language_loss": 0.58676565, + "learning_rate": 7.869751121037192e-07, + "loss": 0.60898793, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 2.9843008518218994 + }, + { + "auxiliary_loss_clip": 0.0114905, + "auxiliary_loss_mlp": 0.01105949, + "balance_loss_clip": 1.00203001, + "balance_loss_mlp": 1.00056756, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 1.610845031928508, + "language_loss": 0.78373492, + "learning_rate": 7.866654842502376e-07, + "loss": 0.80628496, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.59649920463562 + }, + { + "auxiliary_loss_clip": 0.01131966, + "auxiliary_loss_mlp": 0.01104066, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.9126718938035299, + "language_loss": 0.74221826, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76457858, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.6019198894500732 + }, + { + "auxiliary_loss_clip": 0.0111702, + "auxiliary_loss_mlp": 0.01105246, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00062799, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.5998128264513372, + "language_loss": 0.74101615, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76323879, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.613990068435669 + }, + { + "auxiliary_loss_clip": 0.0116561, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00058782, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.8142126270620118, + "language_loss": 0.81035006, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83305728, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.499234914779663 + }, + { + "auxiliary_loss_clip": 0.01069399, + "auxiliary_loss_mlp": 0.01105916, + "balance_loss_clip": 1.00156605, + "balance_loss_mlp": 1.00053525, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.703197134731632, + "language_loss": 0.68340635, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70515949, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 2.720745086669922 + }, + { + "auxiliary_loss_clip": 0.0113459, + "auxiliary_loss_mlp": 0.01105332, + "balance_loss_clip": 1.00196993, + "balance_loss_mlp": 1.00052273, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 2.737403963192676, + "language_loss": 0.75872791, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78112715, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.5751469135284424 + }, + { + "auxiliary_loss_clip": 0.0112819, + "auxiliary_loss_mlp": 0.01082818, + "balance_loss_clip": 1.00129235, + "balance_loss_mlp": 1.00003946, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6284158574206339, + "language_loss": 0.5395565, + "learning_rate": 7.848086837452639e-07, + "loss": 0.56166661, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.192131757736206 + }, + { + "auxiliary_loss_clip": 0.01132562, + "auxiliary_loss_mlp": 0.01106527, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.00057364, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 2.0090385331189515, + "language_loss": 0.69013631, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71252716, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 2.613462448120117 + }, + { + "auxiliary_loss_clip": 0.0113564, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00059474, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 1.8197746537247004, + "language_loss": 0.74838483, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77079237, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.659118890762329 + }, + { + "auxiliary_loss_clip": 0.01113919, + "auxiliary_loss_mlp": 0.01106901, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.00056612, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 3.0202647080170224, + "language_loss": 0.75725591, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77946413, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.590482711791992 + }, + { + "auxiliary_loss_clip": 0.01132141, + "auxiliary_loss_mlp": 0.01082753, + "balance_loss_clip": 1.00133502, + "balance_loss_mlp": 0.99997371, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.8222958937296674, + "language_loss": 0.55168271, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57383168, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 2.9554715156555176 + }, + { + "auxiliary_loss_clip": 0.01117544, + "auxiliary_loss_mlp": 0.01105995, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00061369, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.4587214340467907, + "language_loss": 0.76836193, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79059732, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.6586899757385254 + }, + { + "auxiliary_loss_clip": 0.01115379, + "auxiliary_loss_mlp": 0.01104235, + "balance_loss_clip": 1.00177586, + "balance_loss_mlp": 1.0006659, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.5367708027632783, + "language_loss": 0.67570961, + "learning_rate": 7.829535421264588e-07, + "loss": 0.69790578, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.6131205558776855 + }, + { + "auxiliary_loss_clip": 0.01132222, + "auxiliary_loss_mlp": 0.01104197, + "balance_loss_clip": 1.00175357, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.5521447561479425, + "language_loss": 0.77620625, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79857039, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.6166234016418457 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.0074762, + "balance_loss_clip": 1.00166082, + "balance_loss_mlp": 1.00053668, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 3.570525524538731, + "language_loss": 0.77141416, + "learning_rate": 7.823355306660093e-07, + "loss": 0.7903825, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.5512425899505615 + }, + { + "auxiliary_loss_clip": 0.01150944, + "auxiliary_loss_mlp": 0.01104377, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00042641, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.6172347872883126, + "language_loss": 0.68965691, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71221018, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 2.546450614929199 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01104531, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00058031, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.6598731865203158, + "language_loss": 0.6513564, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67343497, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 2.6843607425689697 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01105981, + "balance_loss_clip": 1.00197911, + "balance_loss_mlp": 1.00059974, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 1.7740198641757734, + "language_loss": 0.6937601, + "learning_rate": 7.81408859809308e-07, + "loss": 0.7161845, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 4.013494491577148 + }, + { + "auxiliary_loss_clip": 0.01117091, + "auxiliary_loss_mlp": 0.01105653, + "balance_loss_clip": 1.0017339, + "balance_loss_mlp": 1.00046277, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.6784611255248245, + "language_loss": 0.80348992, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82571733, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 2.611743450164795 + }, + { + "auxiliary_loss_clip": 0.01149137, + "auxiliary_loss_mlp": 0.01104665, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00061929, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.1460087478717, + "language_loss": 0.78230298, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80484104, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 2.517361879348755 + }, + { + "auxiliary_loss_clip": 0.01149064, + "auxiliary_loss_mlp": 0.01104732, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00040054, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.3930565820192546, + "language_loss": 0.75066406, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77320206, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.568976879119873 + }, + { + "auxiliary_loss_clip": 0.01165866, + "auxiliary_loss_mlp": 0.01107351, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.00053918, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.781102689017002, + "language_loss": 0.69117725, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71390939, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.4558818340301514 + }, + { + "auxiliary_loss_clip": 0.01150302, + "auxiliary_loss_mlp": 0.01105935, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00045824, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 1.87919159801296, + "language_loss": 0.86223733, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88479972, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 2.5516350269317627 + }, + { + "auxiliary_loss_clip": 0.01117428, + "auxiliary_loss_mlp": 0.01105835, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00054908, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.5429780731345115, + "language_loss": 0.74102354, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76325613, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 2.7681849002838135 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01082763, + "balance_loss_clip": 1.00129437, + "balance_loss_mlp": 0.99998373, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7560544052033826, + "language_loss": 0.55872917, + "learning_rate": 7.79248245675082e-07, + "loss": 0.58116943, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 3.070277214050293 + }, + { + "auxiliary_loss_clip": 0.01149094, + "auxiliary_loss_mlp": 0.01105901, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00071061, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.9185398369679496, + "language_loss": 0.5475803, + "learning_rate": 7.789397715835542e-07, + "loss": 0.57013023, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 5.406539440155029 + }, + { + "auxiliary_loss_clip": 0.01148823, + "auxiliary_loss_mlp": 0.01104128, + "balance_loss_clip": 1.00187206, + "balance_loss_mlp": 1.00046349, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.7636386987594854, + "language_loss": 0.76498139, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78751087, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.537520408630371 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_clip": 1.00134993, + "balance_loss_mlp": 0.99999696, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7535219374360831, + "language_loss": 0.61360013, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63572478, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.1072001457214355 + }, + { + "auxiliary_loss_clip": 0.01117356, + "auxiliary_loss_mlp": 0.01104996, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00047314, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.52169644534568, + "language_loss": 0.58859456, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61081815, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.6754438877105713 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00050879, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 1.7912893895111692, + "language_loss": 0.79156935, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81396019, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 2.60121750831604 + }, + { + "auxiliary_loss_clip": 0.01148963, + "auxiliary_loss_mlp": 0.01106094, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00071311, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.672274179754515, + "language_loss": 0.66006786, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68261838, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 2.5514190196990967 + }, + { + "auxiliary_loss_clip": 0.01165543, + "auxiliary_loss_mlp": 0.01105533, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00053334, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.7622696476782687, + "language_loss": 0.78662032, + "learning_rate": 7.770898998009254e-07, + "loss": 0.80933106, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 3.880232095718384 + }, + { + "auxiliary_loss_clip": 0.01130325, + "auxiliary_loss_mlp": 0.00747601, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00059199, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 1.8722102297437417, + "language_loss": 0.6284017, + "learning_rate": 7.767817500740277e-07, + "loss": 0.64718103, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 2.544185161590576 + }, + { + "auxiliary_loss_clip": 0.01145457, + "auxiliary_loss_mlp": 0.01082397, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 0.99999923, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7153191360767742, + "language_loss": 0.51105076, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53332931, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 3.055347204208374 + }, + { + "auxiliary_loss_clip": 0.01117976, + "auxiliary_loss_mlp": 0.01106567, + "balance_loss_clip": 1.00182152, + "balance_loss_mlp": 1.00051856, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 2.368327825097348, + "language_loss": 0.74427748, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76652294, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.6324782371520996 + }, + { + "auxiliary_loss_clip": 0.01100006, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.00161433, + "balance_loss_mlp": 1.00053453, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.4243768594348607, + "language_loss": 0.72470522, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74317968, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 2.6492667198181152 + }, + { + "auxiliary_loss_clip": 0.01136139, + "auxiliary_loss_mlp": 0.01106147, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00057471, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.5434961829941438, + "language_loss": 0.71566737, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73809022, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 2.5916683673858643 + }, + { + "auxiliary_loss_clip": 0.01165679, + "auxiliary_loss_mlp": 0.00747451, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00044537, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.6389424412303353, + "language_loss": 0.76259232, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78172362, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 2.572572708129883 + }, + { + "auxiliary_loss_clip": 0.01165705, + "auxiliary_loss_mlp": 0.01106116, + "balance_loss_clip": 1.00202382, + "balance_loss_mlp": 1.00054479, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 3.7321722224697917, + "language_loss": 0.6707356, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69345379, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.4707422256469727 + }, + { + "auxiliary_loss_clip": 0.01134347, + "auxiliary_loss_mlp": 0.01106281, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.00051808, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.9148797506382167, + "language_loss": 0.7834456, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80585182, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 2.607968807220459 + }, + { + "auxiliary_loss_clip": 0.01151036, + "auxiliary_loss_mlp": 0.01106351, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00049353, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 2.073009122392756, + "language_loss": 0.75018394, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77275783, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.5782971382141113 + }, + { + "auxiliary_loss_clip": 0.01150438, + "auxiliary_loss_mlp": 0.01106118, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00045097, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.6682476440251772, + "language_loss": 0.72805786, + "learning_rate": 7.740104912387164e-07, + "loss": 0.7506234, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.55832839012146 + }, + { + "auxiliary_loss_clip": 0.01132246, + "auxiliary_loss_mlp": 0.01105881, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00069022, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.6565116909154027, + "language_loss": 0.74577582, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76815701, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.584420680999756 + }, + { + "auxiliary_loss_clip": 0.01119634, + "auxiliary_loss_mlp": 0.01105692, + "balance_loss_clip": 1.00172329, + "balance_loss_mlp": 1.00050223, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.504093910925483, + "language_loss": 0.73334134, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75559461, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 2.6874125003814697 + }, + { + "auxiliary_loss_clip": 0.01071019, + "auxiliary_loss_mlp": 0.01106389, + "balance_loss_clip": 1.00170624, + "balance_loss_mlp": 1.0005312, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.6500382429645897, + "language_loss": 0.7088989, + "learning_rate": 7.730875746869987e-07, + "loss": 0.73067296, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 2.760272979736328 + }, + { + "auxiliary_loss_clip": 0.01087094, + "auxiliary_loss_mlp": 0.01105457, + "balance_loss_clip": 1.00150943, + "balance_loss_mlp": 1.00055242, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.7314269316244701, + "language_loss": 0.73288834, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75481385, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.7526674270629883 + }, + { + "auxiliary_loss_clip": 0.0115077, + "auxiliary_loss_mlp": 0.01104958, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00062561, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.7637226651047933, + "language_loss": 0.83837843, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86093569, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 2.56183123588562 + }, + { + "auxiliary_loss_clip": 0.01165686, + "auxiliary_loss_mlp": 0.011061, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.0006237, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.735778161145845, + "language_loss": 0.8177377, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84045553, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.5262057781219482 + }, + { + "auxiliary_loss_clip": 0.01134445, + "auxiliary_loss_mlp": 0.01105468, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00065911, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.6878873096780396, + "language_loss": 0.77858186, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80098099, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.626696825027466 + }, + { + "auxiliary_loss_clip": 0.01148701, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_clip": 1.00194728, + "balance_loss_mlp": 1.00072765, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.348242646938298, + "language_loss": 0.74754953, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77009189, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.5746233463287354 + }, + { + "auxiliary_loss_clip": 0.01148413, + "auxiliary_loss_mlp": 0.01105558, + "balance_loss_clip": 1.00182986, + "balance_loss_mlp": 1.00046301, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.609569359187608, + "language_loss": 0.75099051, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77353024, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 2.550081729888916 + }, + { + "auxiliary_loss_clip": 0.01113882, + "auxiliary_loss_mlp": 0.01107077, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.0005517, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 2.312826984120882, + "language_loss": 0.80155313, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82376271, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.6225709915161133 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01105009, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00058162, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.7921058772280691, + "language_loss": 0.74714386, + "learning_rate": 7.70628511821652e-07, + "loss": 0.7696861, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.5251107215881348 + }, + { + "auxiliary_loss_clip": 0.01132773, + "auxiliary_loss_mlp": 0.0110569, + "balance_loss_clip": 1.00199997, + "balance_loss_mlp": 1.0005002, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.51122154937252, + "language_loss": 0.77229238, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79467702, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 2.6541240215301514 + }, + { + "auxiliary_loss_clip": 0.01135939, + "auxiliary_loss_mlp": 0.01105205, + "balance_loss_clip": 1.00180805, + "balance_loss_mlp": 1.0004915, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.7710159354160822, + "language_loss": 0.73272842, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75513989, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.5759499073028564 + }, + { + "auxiliary_loss_clip": 0.01134103, + "auxiliary_loss_mlp": 0.01104241, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.0006721, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 2.1304758964632793, + "language_loss": 0.81735206, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83973551, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 3.9732556343078613 + }, + { + "auxiliary_loss_clip": 0.01133142, + "auxiliary_loss_mlp": 0.01105023, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00049996, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 1.6575790271856414, + "language_loss": 0.76402724, + "learning_rate": 7.69400098845407e-07, + "loss": 0.7864089, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 2.609769821166992 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.0110542, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00042057, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.4887134770064996, + "language_loss": 0.71010697, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73235995, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 2.616278886795044 + }, + { + "auxiliary_loss_clip": 0.01130097, + "auxiliary_loss_mlp": 0.01082853, + "balance_loss_clip": 1.00140905, + "balance_loss_mlp": 1.00007427, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9164682681186105, + "language_loss": 0.60859859, + "learning_rate": 7.68786172297538e-07, + "loss": 0.63072813, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 3.060540199279785 + }, + { + "auxiliary_loss_clip": 0.01165755, + "auxiliary_loss_mlp": 0.01106441, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.0004878, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.0265976590664216, + "language_loss": 0.79929566, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82201761, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.5146968364715576 + }, + { + "auxiliary_loss_clip": 0.01136068, + "auxiliary_loss_mlp": 0.01106289, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00062168, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.7143701194042258, + "language_loss": 0.75310344, + "learning_rate": 7.681724325006733e-07, + "loss": 0.775527, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.6175835132598877 + }, + { + "auxiliary_loss_clip": 0.01098343, + "auxiliary_loss_mlp": 0.01083283, + "balance_loss_clip": 1.00129056, + "balance_loss_mlp": 1.00012267, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8546270682434095, + "language_loss": 0.57191217, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59372842, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 3.047487258911133 + }, + { + "auxiliary_loss_clip": 0.01132576, + "auxiliary_loss_mlp": 0.01105998, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 1.9453517546273285, + "language_loss": 0.61169744, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63408327, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.656792640686035 + }, + { + "auxiliary_loss_clip": 0.01150777, + "auxiliary_loss_mlp": 0.01104742, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00060046, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.740600424790607, + "language_loss": 0.67620039, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69875556, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 5.451359748840332 + }, + { + "auxiliary_loss_clip": 0.01119044, + "auxiliary_loss_mlp": 0.01105175, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.0004611, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.801838614125707, + "language_loss": 0.67158896, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69383115, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.6267917156219482 + }, + { + "auxiliary_loss_clip": 0.01134042, + "auxiliary_loss_mlp": 0.01105806, + "balance_loss_clip": 1.00187361, + "balance_loss_mlp": 1.00052071, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.5082406756759845, + "language_loss": 0.75756466, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77996314, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 2.6598751544952393 + }, + { + "auxiliary_loss_clip": 0.01165492, + "auxiliary_loss_mlp": 0.01104929, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00050151, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 1.9127461645582542, + "language_loss": 0.7897197, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81242394, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 2.5410139560699463 + }, + { + "auxiliary_loss_clip": 0.01151082, + "auxiliary_loss_mlp": 0.01105834, + "balance_loss_clip": 1.00199997, + "balance_loss_mlp": 1.00054836, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.6605860990576076, + "language_loss": 0.64697456, + "learning_rate": 7.660258152195767e-07, + "loss": 0.66954374, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 2.6162538528442383 + }, + { + "auxiliary_loss_clip": 0.01150844, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00059366, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.85205717638477, + "language_loss": 0.66793895, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69050241, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 2.5907137393951416 + }, + { + "auxiliary_loss_clip": 0.01132325, + "auxiliary_loss_mlp": 0.01105933, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.0005517, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.741943620694026, + "language_loss": 0.73516917, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75755173, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 3.99053692817688 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00044179, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.7736601449472147, + "language_loss": 0.663867, + "learning_rate": 7.65106538038665e-07, + "loss": 0.6826874, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 2.55975079536438 + }, + { + "auxiliary_loss_clip": 0.01132727, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.00164032, + "balance_loss_mlp": 1.00060701, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.4747864390334242, + "language_loss": 0.66251779, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68490493, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.585174798965454 + }, + { + "auxiliary_loss_clip": 0.01151079, + "auxiliary_loss_mlp": 0.01106453, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.00059545, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.5765819160084042, + "language_loss": 0.73934197, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76191723, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.5322048664093018 + }, + { + "auxiliary_loss_clip": 0.01148993, + "auxiliary_loss_mlp": 0.01104563, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00051665, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.6221956432659268, + "language_loss": 0.62766373, + "learning_rate": 7.641876823032977e-07, + "loss": 0.65019941, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.5843842029571533 + }, + { + "auxiliary_loss_clip": 0.01134272, + "auxiliary_loss_mlp": 0.01106134, + "balance_loss_clip": 1.00200164, + "balance_loss_mlp": 1.00056243, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.6916015911419542, + "language_loss": 0.72681653, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74922061, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.5763018131256104 + }, + { + "auxiliary_loss_clip": 0.01131714, + "auxiliary_loss_mlp": 0.0110563, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00053477, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.7117632913340366, + "language_loss": 0.78622782, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80860126, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.5708580017089844 + }, + { + "auxiliary_loss_clip": 0.01165643, + "auxiliary_loss_mlp": 0.0110537, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00056076, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 1.8832498237269388, + "language_loss": 0.78813493, + "learning_rate": 7.632692483270618e-07, + "loss": 0.81084508, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 2.526592254638672 + }, + { + "auxiliary_loss_clip": 0.01165428, + "auxiliary_loss_mlp": 0.01104798, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00046551, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.633611707791364, + "language_loss": 0.82362306, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84632528, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.4882500171661377 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01105373, + "balance_loss_clip": 1.00191367, + "balance_loss_mlp": 1.00056398, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 1.9030887453405503, + "language_loss": 0.76589, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78824329, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.5335800647735596 + }, + { + "auxiliary_loss_clip": 0.01117433, + "auxiliary_loss_mlp": 0.01104655, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.00051427, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.4279059233898208, + "language_loss": 0.72517633, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74739724, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 2.6820859909057617 + }, + { + "auxiliary_loss_clip": 0.01150506, + "auxiliary_loss_mlp": 0.0110614, + "balance_loss_clip": 1.00189722, + "balance_loss_mlp": 1.00047302, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 2.0337647829153904, + "language_loss": 0.66303444, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68560088, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.569272041320801 + }, + { + "auxiliary_loss_clip": 0.01150438, + "auxiliary_loss_mlp": 0.01105737, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00054634, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.1767463743965636, + "language_loss": 0.65934497, + "learning_rate": 7.61739463127115e-07, + "loss": 0.6819067, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.5716633796691895 + }, + { + "auxiliary_loss_clip": 0.01151213, + "auxiliary_loss_mlp": 0.01106423, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00056529, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.676071419897328, + "language_loss": 0.66941381, + "learning_rate": 7.614336469056172e-07, + "loss": 0.69199014, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 2.515784978866577 + }, + { + "auxiliary_loss_clip": 0.01133593, + "auxiliary_loss_mlp": 0.01105815, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00052917, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 9.083061449612128, + "language_loss": 0.79414707, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81654119, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 2.6062841415405273 + }, + { + "auxiliary_loss_clip": 0.01165711, + "auxiliary_loss_mlp": 0.01106055, + "balance_loss_clip": 1.00203884, + "balance_loss_mlp": 1.00048304, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 2.93936247833083, + "language_loss": 0.81182373, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83454132, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.55133318901062 + }, + { + "auxiliary_loss_clip": 0.01165765, + "auxiliary_loss_mlp": 0.01105802, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00061202, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 1.9243654411551898, + "language_loss": 0.66883445, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69155014, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 2.5056965351104736 + }, + { + "auxiliary_loss_clip": 0.0116573, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00062609, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.9701635968268738, + "language_loss": 0.72400707, + "learning_rate": 7.602108518011696e-07, + "loss": 0.7467159, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.485283613204956 + }, + { + "auxiliary_loss_clip": 0.01133708, + "auxiliary_loss_mlp": 0.011052, + "balance_loss_clip": 1.00182068, + "balance_loss_mlp": 1.00048649, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.182053620071615, + "language_loss": 0.82911456, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85150373, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.5604240894317627 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01105993, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00070715, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.8801601644294248, + "language_loss": 0.76882631, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79137611, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.523515462875366 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.0110513, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00070214, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.6175835987346954, + "language_loss": 0.81606567, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83862734, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 2.5526793003082275 + }, + { + "auxiliary_loss_clip": 0.01148926, + "auxiliary_loss_mlp": 0.01106131, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00046396, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 1.8537976873969604, + "language_loss": 0.62579083, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64834142, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.6972198486328125 + }, + { + "auxiliary_loss_clip": 0.01165904, + "auxiliary_loss_mlp": 0.011073, + "balance_loss_clip": 1.00206447, + "balance_loss_mlp": 1.00067902, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.13040398485999, + "language_loss": 0.68257976, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70531178, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 2.49210262298584 + }, + { + "auxiliary_loss_clip": 0.01129098, + "auxiliary_loss_mlp": 0.01083504, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00034392, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.882763951624534, + "language_loss": 0.54134405, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56347001, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 4.525131702423096 + }, + { + "auxiliary_loss_clip": 0.01116496, + "auxiliary_loss_mlp": 0.01105585, + "balance_loss_clip": 1.0017333, + "balance_loss_mlp": 1.00048995, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.4976850599691554, + "language_loss": 0.63594532, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65816617, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.814225912094116 + }, + { + "auxiliary_loss_clip": 0.01134501, + "auxiliary_loss_mlp": 0.0110484, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00060344, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.7820149695151384, + "language_loss": 0.91634083, + "learning_rate": 7.577675189541865e-07, + "loss": 0.93873423, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.5919573307037354 + }, + { + "auxiliary_loss_clip": 0.01119182, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.000633, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.9406827828584405, + "language_loss": 0.64006317, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66232467, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 2.5926871299743652 + }, + { + "auxiliary_loss_clip": 0.01148454, + "auxiliary_loss_mlp": 0.01106031, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.0005542, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.2442227461611246, + "language_loss": 0.7835598, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80610472, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 2.5629799365997314 + }, + { + "auxiliary_loss_clip": 0.01149038, + "auxiliary_loss_mlp": 0.01106781, + "balance_loss_clip": 1.00198054, + "balance_loss_mlp": 1.00063741, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.685031900896428, + "language_loss": 0.63369417, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65625238, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.5934555530548096 + }, + { + "auxiliary_loss_clip": 0.01165765, + "auxiliary_loss_mlp": 0.01106231, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.0005641, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 1.7892984355366506, + "language_loss": 0.77077514, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79349512, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.550899028778076 + }, + { + "auxiliary_loss_clip": 0.01151295, + "auxiliary_loss_mlp": 0.01105364, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00065041, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.6458634467923405, + "language_loss": 0.79260468, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81517124, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.560347080230713 + }, + { + "auxiliary_loss_clip": 0.01132138, + "auxiliary_loss_mlp": 0.01104804, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00047195, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7753195032500146, + "language_loss": 0.75488412, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77725357, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 2.581341505050659 + }, + { + "auxiliary_loss_clip": 0.01165608, + "auxiliary_loss_mlp": 0.01106074, + "balance_loss_clip": 1.002056, + "balance_loss_mlp": 1.0004065, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.5129723138889877, + "language_loss": 0.75972116, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78243798, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 5.388145685195923 + }, + { + "auxiliary_loss_clip": 0.0115054, + "auxiliary_loss_mlp": 0.01106257, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00049472, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.4781966602652779, + "language_loss": 0.86544526, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88801324, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.60744047164917 + }, + { + "auxiliary_loss_clip": 0.01148901, + "auxiliary_loss_mlp": 0.01104863, + "balance_loss_clip": 1.0019964, + "balance_loss_mlp": 1.00053096, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.7340785583552798, + "language_loss": 0.78187162, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80440927, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.5361509323120117 + }, + { + "auxiliary_loss_clip": 0.01119568, + "auxiliary_loss_mlp": 0.01105948, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.0006628, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.4432942563421853, + "language_loss": 0.7776109, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79986608, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 2.658017873764038 + }, + { + "auxiliary_loss_clip": 0.01165525, + "auxiliary_loss_mlp": 0.01104585, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00053966, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 1.8663279064815708, + "language_loss": 0.74160236, + "learning_rate": 7.54412860030732e-07, + "loss": 0.76430345, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.515773296356201 + }, + { + "auxiliary_loss_clip": 0.01117325, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00067604, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 1.6666487887695158, + "language_loss": 0.77375233, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79597187, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 2.6264610290527344 + }, + { + "auxiliary_loss_clip": 0.01133644, + "auxiliary_loss_mlp": 0.01105525, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00043035, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.920813342681461, + "language_loss": 0.74058497, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76297665, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 4.154085874557495 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.0110582, + "balance_loss_clip": 1.00203264, + "balance_loss_mlp": 1.00053453, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.5475103163493966, + "language_loss": 0.77297473, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79550433, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 2.576232433319092 + }, + { + "auxiliary_loss_clip": 0.01117391, + "auxiliary_loss_mlp": 0.01105217, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00059915, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 1.5098988225601038, + "language_loss": 0.67948711, + "learning_rate": 7.531944002330073e-07, + "loss": 0.7017132, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.629979133605957 + }, + { + "auxiliary_loss_clip": 0.01149998, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_clip": 1.00184715, + "balance_loss_mlp": 1.00041163, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.8077064516397547, + "language_loss": 0.69496411, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71752203, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 2.599177837371826 + }, + { + "auxiliary_loss_clip": 0.01135766, + "auxiliary_loss_mlp": 0.01105678, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00058353, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.8185275832848569, + "language_loss": 0.70742917, + "learning_rate": 7.525854539619052e-07, + "loss": 0.72984362, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.627260684967041 + }, + { + "auxiliary_loss_clip": 0.01115291, + "auxiliary_loss_mlp": 0.01104248, + "balance_loss_clip": 1.00169623, + "balance_loss_mlp": 1.00067949, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 5.814424861814171, + "language_loss": 0.75671804, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77891338, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.5804636478424072 + }, + { + "auxiliary_loss_clip": 0.01150734, + "auxiliary_loss_mlp": 0.01104968, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00054026, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.248471763837558, + "language_loss": 0.76490307, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78746015, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 2.5495002269744873 + }, + { + "auxiliary_loss_clip": 0.01148901, + "auxiliary_loss_mlp": 0.0110562, + "balance_loss_clip": 1.00193083, + "balance_loss_mlp": 1.00062013, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 3.118974568559198, + "language_loss": 0.67726129, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69980651, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.602217435836792 + }, + { + "auxiliary_loss_clip": 0.01099287, + "auxiliary_loss_mlp": 0.01106414, + "balance_loss_clip": 1.00159669, + "balance_loss_mlp": 1.00055647, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 1.8229202779085982, + "language_loss": 0.79315019, + "learning_rate": 7.513681291370469e-07, + "loss": 0.81520712, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 2.704310417175293 + }, + { + "auxiliary_loss_clip": 0.01116862, + "auxiliary_loss_mlp": 0.01104987, + "balance_loss_clip": 1.00193822, + "balance_loss_mlp": 1.00046468, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.9015301734158705, + "language_loss": 0.81885147, + "learning_rate": 7.510639162726e-07, + "loss": 0.84107, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.6214656829833984 + }, + { + "auxiliary_loss_clip": 0.01129299, + "auxiliary_loss_mlp": 0.01082771, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 0.99999255, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8083580076624528, + "language_loss": 0.61764497, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63976568, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.2164194583892822 + }, + { + "auxiliary_loss_clip": 0.01150655, + "auxiliary_loss_mlp": 0.01104583, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00044203, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.6198670908319177, + "language_loss": 0.78013813, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80269051, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01106013, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00044119, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 1.748858090263906, + "language_loss": 0.81864977, + "learning_rate": 7.501515618840834e-07, + "loss": 0.84120035, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 2.575052499771118 + }, + { + "auxiliary_loss_clip": 0.0111994, + "auxiliary_loss_mlp": 0.01106434, + "balance_loss_clip": 1.00190961, + "balance_loss_mlp": 1.00057602, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 2.0049826741189185, + "language_loss": 0.74748123, + "learning_rate": 7.498475385279592e-07, + "loss": 0.76974499, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.6116039752960205 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00048935, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.6027895593727557, + "language_loss": 0.75116491, + "learning_rate": 7.495435625777423e-07, + "loss": 0.7733897, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.6167516708374023 + }, + { + "auxiliary_loss_clip": 0.01132359, + "auxiliary_loss_mlp": 0.0110396, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.0005815, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.6377406096297409, + "language_loss": 0.80876732, + "learning_rate": 7.492396340449578e-07, + "loss": 0.8311305, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.631197690963745 + }, + { + "auxiliary_loss_clip": 0.01085994, + "auxiliary_loss_mlp": 0.01104917, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00058472, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 2.037201739832341, + "language_loss": 0.6101073, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63201642, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.6630783081054688 + }, + { + "auxiliary_loss_clip": 0.01148235, + "auxiliary_loss_mlp": 0.01104567, + "balance_loss_clip": 1.00176013, + "balance_loss_mlp": 1.00061631, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.7338058483693028, + "language_loss": 0.67770982, + "learning_rate": 7.486319192777883e-07, + "loss": 0.70023781, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 2.546231985092163 + }, + { + "auxiliary_loss_clip": 0.01165612, + "auxiliary_loss_mlp": 0.01105619, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00061989, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 1.9802924081358169, + "language_loss": 0.72305059, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74576294, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 2.538092613220215 + }, + { + "auxiliary_loss_clip": 0.01165751, + "auxiliary_loss_mlp": 0.01105617, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00042689, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.5856417208347464, + "language_loss": 0.72089887, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74361253, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.5016698837280273 + }, + { + "auxiliary_loss_clip": 0.0116561, + "auxiliary_loss_mlp": 0.01105186, + "balance_loss_clip": 1.00195587, + "balance_loss_mlp": 1.00056815, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 1.6545773047669252, + "language_loss": 0.75731182, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78001976, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.545598268508911 + }, + { + "auxiliary_loss_clip": 0.01117487, + "auxiliary_loss_mlp": 0.01105059, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00063217, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.578399376328569, + "language_loss": 0.76783097, + "learning_rate": 7.474170592596301e-07, + "loss": 0.79005647, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.618961811065674 + }, + { + "auxiliary_loss_clip": 0.01150743, + "auxiliary_loss_mlp": 0.0110538, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00047565, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.163563571355537, + "language_loss": 0.63975334, + "learning_rate": 7.471134629714797e-07, + "loss": 0.66231459, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 2.5446348190307617 + }, + { + "auxiliary_loss_clip": 0.01117789, + "auxiliary_loss_mlp": 0.01105552, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00055218, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 1.8461544619507124, + "language_loss": 0.83348751, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85572094, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 2.6246531009674072 + }, + { + "auxiliary_loss_clip": 0.01115693, + "auxiliary_loss_mlp": 0.01106306, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00044823, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.6896515842603195, + "language_loss": 0.64392126, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66614127, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 4.052567958831787 + }, + { + "auxiliary_loss_clip": 0.01165877, + "auxiliary_loss_mlp": 0.01106168, + "balance_loss_clip": 1.00216246, + "balance_loss_mlp": 1.00069153, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.4969252966168667, + "language_loss": 0.81578743, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83850789, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 2.489755392074585 + }, + { + "auxiliary_loss_clip": 0.01165615, + "auxiliary_loss_mlp": 0.01105518, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00061369, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.8074112616430027, + "language_loss": 0.72001952, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74273086, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.501671314239502 + }, + { + "auxiliary_loss_clip": 0.01118957, + "auxiliary_loss_mlp": 0.01105293, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00048411, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.841827632786113, + "language_loss": 0.70808971, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73033226, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 2.6199817657470703 + }, + { + "auxiliary_loss_clip": 0.01132108, + "auxiliary_loss_mlp": 0.01106358, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00040507, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6168149808931231, + "language_loss": 0.70029503, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72267973, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.6227903366088867 + }, + { + "auxiliary_loss_clip": 0.0113203, + "auxiliary_loss_mlp": 0.01082042, + "balance_loss_clip": 1.00126386, + "balance_loss_mlp": 1.00002563, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8264540609660496, + "language_loss": 0.53736305, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55950379, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 3.1745166778564453 + }, + { + "auxiliary_loss_clip": 0.01132056, + "auxiliary_loss_mlp": 0.01106852, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00042164, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 6.075164440053259, + "language_loss": 0.59925103, + "learning_rate": 7.446864039779258e-07, + "loss": 0.62164015, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.54280686378479 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01082006, + "balance_loss_clip": 1.00140238, + "balance_loss_mlp": 0.99999017, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7127523562028564, + "language_loss": 0.53270411, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55451566, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 4.5904412269592285 + }, + { + "auxiliary_loss_clip": 0.01150996, + "auxiliary_loss_mlp": 0.01105036, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00070357, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.4905582700586115, + "language_loss": 0.72107565, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74363601, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 3.9850876331329346 + }, + { + "auxiliary_loss_clip": 0.01150793, + "auxiliary_loss_mlp": 0.0110553, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00062633, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 1.8790427292117398, + "language_loss": 0.74267852, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76524174, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 2.6394846439361572 + }, + { + "auxiliary_loss_clip": 0.01118036, + "auxiliary_loss_mlp": 0.01105442, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.00044227, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 2.125001785920593, + "language_loss": 0.78083688, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80307168, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.630281686782837 + }, + { + "auxiliary_loss_clip": 0.011193, + "auxiliary_loss_mlp": 0.01105741, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.00064588, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.3070134151846915, + "language_loss": 0.68505353, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70730388, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 2.586777687072754 + }, + { + "auxiliary_loss_clip": 0.01117709, + "auxiliary_loss_mlp": 0.01104577, + "balance_loss_clip": 1.00166845, + "balance_loss_mlp": 1.00062609, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.6446207587857777, + "language_loss": 0.74231434, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76453716, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.6150240898132324 + }, + { + "auxiliary_loss_clip": 0.01165621, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00047493, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.4242275425740343, + "language_loss": 0.71233189, + "learning_rate": 7.425652262418368e-07, + "loss": 0.73502851, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 3.977132797241211 + }, + { + "auxiliary_loss_clip": 0.01098682, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.00188172, + "balance_loss_mlp": 1.00068176, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.712861645786621, + "language_loss": 0.62487811, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64692557, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 2.6456141471862793 + }, + { + "auxiliary_loss_clip": 0.01104138, + "auxiliary_loss_mlp": 0.01105788, + "balance_loss_clip": 1.00169778, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.9448501694608595, + "language_loss": 0.74865234, + "learning_rate": 7.419596044262535e-07, + "loss": 0.7707516, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.6521337032318115 + }, + { + "auxiliary_loss_clip": 0.01149788, + "auxiliary_loss_mlp": 0.01104712, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00076151, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.7108669098090925, + "language_loss": 0.79055268, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81309766, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 2.563068151473999 + }, + { + "auxiliary_loss_clip": 0.01148936, + "auxiliary_loss_mlp": 0.01105087, + "balance_loss_clip": 1.0019666, + "balance_loss_mlp": 1.00046921, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 1.9777889193422309, + "language_loss": 0.764274, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78681421, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.5914340019226074 + }, + { + "auxiliary_loss_clip": 0.01165615, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.00053096, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.6387507198498676, + "language_loss": 0.8144486, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83357859, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 2.4858756065368652 + }, + { + "auxiliary_loss_clip": 0.01105051, + "auxiliary_loss_mlp": 0.01106885, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00055051, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 2.2362562972056135, + "language_loss": 0.69251758, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71463692, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 2.6996989250183105 + }, + { + "auxiliary_loss_clip": 0.01117562, + "auxiliary_loss_mlp": 0.01103703, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00041974, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.5380060079904263, + "language_loss": 0.69948423, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72169691, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.632883071899414 + }, + { + "auxiliary_loss_clip": 0.01133971, + "auxiliary_loss_mlp": 0.01105056, + "balance_loss_clip": 1.00187325, + "balance_loss_mlp": 1.0005337, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 1.7291942079976466, + "language_loss": 0.90182644, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92421675, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.617230176925659 + }, + { + "auxiliary_loss_clip": 0.01146338, + "auxiliary_loss_mlp": 0.01082368, + "balance_loss_clip": 1.00125229, + "balance_loss_mlp": 0.99997061, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.66511110961808, + "language_loss": 0.56096768, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58325475, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.2877871990203857 + }, + { + "auxiliary_loss_clip": 0.01099575, + "auxiliary_loss_mlp": 0.01104622, + "balance_loss_clip": 1.00168777, + "balance_loss_mlp": 1.00048101, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.5809619407563915, + "language_loss": 0.76592982, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78797185, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.6953372955322266 + }, + { + "auxiliary_loss_clip": 0.01129193, + "auxiliary_loss_mlp": 0.01081994, + "balance_loss_clip": 1.00124741, + "balance_loss_mlp": 0.99997789, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7216696463508004, + "language_loss": 0.57058775, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59269953, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 3.0471420288085938 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.0108241, + "balance_loss_clip": 1.0014137, + "balance_loss_mlp": 1.00001228, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6663146634680104, + "language_loss": 0.55443901, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57624948, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.215891122817993 + }, + { + "auxiliary_loss_clip": 0.01116721, + "auxiliary_loss_mlp": 0.01104056, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00048685, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.781574680249364, + "language_loss": 0.79449904, + "learning_rate": 7.38632097810854e-07, + "loss": 0.81670678, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.659254550933838 + }, + { + "auxiliary_loss_clip": 0.01134097, + "auxiliary_loss_mlp": 0.01104449, + "balance_loss_clip": 1.00197065, + "balance_loss_mlp": 1.00059426, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 1.71505239063182, + "language_loss": 0.71830517, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74069065, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.6171326637268066 + }, + { + "auxiliary_loss_clip": 0.01165493, + "auxiliary_loss_mlp": 0.01105401, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00068784, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 2.1798358528245894, + "language_loss": 0.70095944, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72366834, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 2.4976935386657715 + }, + { + "auxiliary_loss_clip": 0.01117439, + "auxiliary_loss_mlp": 0.0110557, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00057065, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 2.0232716773640216, + "language_loss": 0.7824139, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80464399, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 2.621077537536621 + }, + { + "auxiliary_loss_clip": 0.0113411, + "auxiliary_loss_mlp": 0.01104792, + "balance_loss_clip": 1.00184858, + "balance_loss_mlp": 1.00046051, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 2.4452173396247763, + "language_loss": 0.7015568, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72394574, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.6783640384674072 + }, + { + "auxiliary_loss_clip": 0.01136411, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00049567, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 4.882367284182606, + "language_loss": 0.74494946, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76737332, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.616124153137207 + }, + { + "auxiliary_loss_clip": 0.01149304, + "auxiliary_loss_mlp": 0.01105607, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00051165, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.3839996824159095, + "language_loss": 0.63496768, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65751684, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 2.9272964000701904 + }, + { + "auxiliary_loss_clip": 0.01120017, + "auxiliary_loss_mlp": 0.01105385, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.00048065, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 1.777330153894989, + "language_loss": 0.78645718, + "learning_rate": 7.365176060028912e-07, + "loss": 0.80871117, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 2.6553335189819336 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.0074608, + "balance_loss_clip": 1.00137281, + "balance_loss_mlp": 1.00052226, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8847329092136409, + "language_loss": 0.65009165, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66916585, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 3.0933711528778076 + }, + { + "auxiliary_loss_clip": 0.01144665, + "auxiliary_loss_mlp": 0.01082789, + "balance_loss_clip": 1.00127625, + "balance_loss_mlp": 1.00001037, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7234349000823149, + "language_loss": 0.59300238, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61527693, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 3.2221503257751465 + }, + { + "auxiliary_loss_clip": 0.01100286, + "auxiliary_loss_mlp": 0.01104993, + "balance_loss_clip": 1.00176251, + "balance_loss_mlp": 1.00047052, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 2.1340123492500753, + "language_loss": 0.6476264, + "learning_rate": 7.356121136696895e-07, + "loss": 0.66967916, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.692991018295288 + }, + { + "auxiliary_loss_clip": 0.01102597, + "auxiliary_loss_mlp": 0.01104711, + "balance_loss_clip": 1.00167966, + "balance_loss_mlp": 1.00037909, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 2.7717270638051095, + "language_loss": 0.69559664, + "learning_rate": 7.35310378768128e-07, + "loss": 0.71766973, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 4.0379064083099365 + }, + { + "auxiliary_loss_clip": 0.01165739, + "auxiliary_loss_mlp": 0.01105553, + "balance_loss_clip": 1.00201321, + "balance_loss_mlp": 1.00055313, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.8421274600015, + "language_loss": 0.81576359, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83847654, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 2.4858546257019043 + }, + { + "auxiliary_loss_clip": 0.01151146, + "auxiliary_loss_mlp": 0.01106684, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00044429, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.7306234334530137, + "language_loss": 0.76779997, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79037821, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.5849478244781494 + }, + { + "auxiliary_loss_clip": 0.0116583, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_clip": 1.00202167, + "balance_loss_mlp": 1.00056005, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.6228339115934804, + "language_loss": 0.72780848, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75051856, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.5382072925567627 + }, + { + "auxiliary_loss_clip": 0.01165762, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 9.215315586643337, + "language_loss": 0.77567065, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79838645, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.5324013233184814 + }, + { + "auxiliary_loss_clip": 0.0114894, + "auxiliary_loss_mlp": 0.01106079, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00060225, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.6227441103692275, + "language_loss": 0.72501671, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74756682, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 2.5445027351379395 + }, + { + "auxiliary_loss_clip": 0.01118966, + "auxiliary_loss_mlp": 0.0110555, + "balance_loss_clip": 1.00182331, + "balance_loss_mlp": 1.00064611, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.5483415617375684, + "language_loss": 0.69333929, + "learning_rate": 7.335009768593938e-07, + "loss": 0.7155844, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.6954681873321533 + }, + { + "auxiliary_loss_clip": 0.01165836, + "auxiliary_loss_mlp": 0.01106581, + "balance_loss_clip": 1.00202715, + "balance_loss_mlp": 1.00053275, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 1.6428791438943549, + "language_loss": 0.79166311, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81438726, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.510366201400757 + }, + { + "auxiliary_loss_clip": 0.01149185, + "auxiliary_loss_mlp": 0.01106002, + "balance_loss_clip": 1.00172246, + "balance_loss_mlp": 1.00071621, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.797734131626191, + "language_loss": 0.73738241, + "learning_rate": 7.328982269740221e-07, + "loss": 0.75993425, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 5.381840944290161 + }, + { + "auxiliary_loss_clip": 0.01136283, + "auxiliary_loss_mlp": 0.01105697, + "balance_loss_clip": 1.00194824, + "balance_loss_mlp": 1.00069714, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.6595590308713941, + "language_loss": 0.70952737, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73194718, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 2.618340015411377 + }, + { + "auxiliary_loss_clip": 0.01084592, + "auxiliary_loss_mlp": 0.01106778, + "balance_loss_clip": 1.0016166, + "balance_loss_mlp": 1.00053883, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.67517473736118, + "language_loss": 0.77375364, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79566741, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 2.7997775077819824 + }, + { + "auxiliary_loss_clip": 0.01151161, + "auxiliary_loss_mlp": 0.00747566, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00055003, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 1.8566330581452934, + "language_loss": 0.71550286, + "learning_rate": 7.319944625392205e-07, + "loss": 0.7344901, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 2.5338668823242188 + }, + { + "auxiliary_loss_clip": 0.01150031, + "auxiliary_loss_mlp": 0.01105496, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00059164, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.714019544580216, + "language_loss": 0.61043537, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63299072, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.6535911560058594 + }, + { + "auxiliary_loss_clip": 0.01132545, + "auxiliary_loss_mlp": 0.01106357, + "balance_loss_clip": 1.00188816, + "balance_loss_mlp": 1.00068974, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.4817841829748224, + "language_loss": 0.75721121, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77960014, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.6027352809906006 + }, + { + "auxiliary_loss_clip": 0.01119392, + "auxiliary_loss_mlp": 0.01104169, + "balance_loss_clip": 1.00164628, + "balance_loss_mlp": 1.00050426, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 2.009820326200171, + "language_loss": 0.84690291, + "learning_rate": 7.310911308504808e-07, + "loss": 0.86913848, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 4.023151159286499 + }, + { + "auxiliary_loss_clip": 0.01148684, + "auxiliary_loss_mlp": 0.0110505, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.00062251, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.7900015105393643, + "language_loss": 0.77529132, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79782867, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 2.5601422786712646 + }, + { + "auxiliary_loss_clip": 0.01165884, + "auxiliary_loss_mlp": 0.01105038, + "balance_loss_clip": 1.00215495, + "balance_loss_mlp": 1.00061047, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 1.8370972555343366, + "language_loss": 0.72446907, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74717832, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 2.4890055656433105 + }, + { + "auxiliary_loss_clip": 0.01148991, + "auxiliary_loss_mlp": 0.00747532, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.0005821, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 1.747830115278533, + "language_loss": 0.76913249, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78809774, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.601247787475586 + }, + { + "auxiliary_loss_clip": 0.0113382, + "auxiliary_loss_mlp": 0.01105584, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.000489, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 2.726626571241238, + "language_loss": 0.67571551, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69810951, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 2.999690055847168 + }, + { + "auxiliary_loss_clip": 0.01150826, + "auxiliary_loss_mlp": 0.01106257, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.0006851, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.5681684140835548, + "language_loss": 0.72508228, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74765313, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 2.5575032234191895 + }, + { + "auxiliary_loss_clip": 0.01150992, + "auxiliary_loss_mlp": 0.01106292, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00072074, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.3650479043632915, + "language_loss": 0.74687684, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76944971, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.59328031539917 + }, + { + "auxiliary_loss_clip": 0.01117417, + "auxiliary_loss_mlp": 0.01104531, + "balance_loss_clip": 1.00192702, + "balance_loss_mlp": 1.00058055, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.96788838953388, + "language_loss": 0.82257223, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84479171, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 2.6508665084838867 + }, + { + "auxiliary_loss_clip": 0.01148961, + "auxiliary_loss_mlp": 0.01105196, + "balance_loss_clip": 1.00199115, + "balance_loss_mlp": 1.0005784, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 3.0030928522283933, + "language_loss": 0.81783366, + "learning_rate": 7.286843643386495e-07, + "loss": 0.8403753, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 2.539471387863159 + }, + { + "auxiliary_loss_clip": 0.0113217, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.00185084, + "balance_loss_mlp": 1.00039768, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.5464026577415035, + "language_loss": 0.66597652, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68834937, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.6265177726745605 + }, + { + "auxiliary_loss_clip": 0.01118932, + "auxiliary_loss_mlp": 0.01104056, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00048661, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.8035188395186106, + "language_loss": 0.65914524, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68137509, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.718996047973633 + }, + { + "auxiliary_loss_clip": 0.01165617, + "auxiliary_loss_mlp": 0.01105135, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00061226, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.013033896111874, + "language_loss": 0.75378114, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77648866, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.507927656173706 + }, + { + "auxiliary_loss_clip": 0.01150676, + "auxiliary_loss_mlp": 0.01106965, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.0006299, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 1.8562704924137727, + "language_loss": 0.70276737, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72534382, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 2.6631367206573486 + }, + { + "auxiliary_loss_clip": 0.01151143, + "auxiliary_loss_mlp": 0.01105055, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00062799, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.3746262568880876, + "language_loss": 0.75020182, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77276385, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 2.8474419116973877 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.01106102, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00062525, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.5192064500436229, + "language_loss": 0.6706934, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69341159, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.6273226737976074 + }, + { + "auxiliary_loss_clip": 0.01116842, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00059605, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 2.105368943607679, + "language_loss": 0.63332069, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65554976, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.585751533508301 + }, + { + "auxiliary_loss_clip": 0.01115378, + "auxiliary_loss_mlp": 0.01105225, + "balance_loss_clip": 1.00156522, + "balance_loss_mlp": 1.00041568, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.5366929250290855, + "language_loss": 0.57994407, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60215008, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 2.7093019485473633 + }, + { + "auxiliary_loss_clip": 0.01085066, + "auxiliary_loss_mlp": 0.0110469, + "balance_loss_clip": 1.00164294, + "balance_loss_mlp": 1.00045371, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.060005324482301, + "language_loss": 0.7384268, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76032436, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.6907386779785156 + }, + { + "auxiliary_loss_clip": 0.01150872, + "auxiliary_loss_mlp": 0.01104399, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00054348, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 1.9521823151153586, + "language_loss": 0.66850805, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69106078, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.5636842250823975 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01105889, + "balance_loss_clip": 1.00184572, + "balance_loss_mlp": 1.00060356, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.6455351166284793, + "language_loss": 0.7344867, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75671875, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.6210153102874756 + }, + { + "auxiliary_loss_clip": 0.01130183, + "auxiliary_loss_mlp": 0.01105106, + "balance_loss_clip": 1.00192249, + "balance_loss_mlp": 1.00058389, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 1.7843313901447018, + "language_loss": 0.68190086, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70425379, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.6202335357666016 + }, + { + "auxiliary_loss_clip": 0.01165885, + "auxiliary_loss_mlp": 0.01106516, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00056279, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.8219784520339048, + "language_loss": 0.59806079, + "learning_rate": 7.247799517967674e-07, + "loss": 0.62078482, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.5069236755371094 + }, + { + "auxiliary_loss_clip": 0.01149315, + "auxiliary_loss_mlp": 0.01105904, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00061798, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.6815850121551323, + "language_loss": 0.73083007, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75338221, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 2.5608489513397217 + }, + { + "auxiliary_loss_clip": 0.01150912, + "auxiliary_loss_mlp": 0.01105657, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.0004667, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 1.8420599947548364, + "language_loss": 0.69158584, + "learning_rate": 7.241799976651807e-07, + "loss": 0.7141515, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 2.539614200592041 + }, + { + "auxiliary_loss_clip": 0.0110268, + "auxiliary_loss_mlp": 0.01105109, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.00077724, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.6146621828403096, + "language_loss": 0.84287429, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86495221, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 4.12647557258606 + }, + { + "auxiliary_loss_clip": 0.01165759, + "auxiliary_loss_mlp": 0.01105674, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00048411, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.0884412417697034, + "language_loss": 0.81928527, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84199959, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 2.487945318222046 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01104827, + "balance_loss_clip": 1.00185895, + "balance_loss_mlp": 1.00087619, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 1.767332143242186, + "language_loss": 0.79056716, + "learning_rate": 7.232804293403963e-07, + "loss": 0.81279188, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 2.6074557304382324 + }, + { + "auxiliary_loss_clip": 0.01165661, + "auxiliary_loss_mlp": 0.01106451, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00059307, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.562989603167754, + "language_loss": 0.6941818, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71690297, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.5383522510528564 + }, + { + "auxiliary_loss_clip": 0.01102481, + "auxiliary_loss_mlp": 0.01104338, + "balance_loss_clip": 1.00161004, + "balance_loss_mlp": 1.00048316, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.6760884755395824, + "language_loss": 0.8655746, + "learning_rate": 7.226809591715923e-07, + "loss": 0.88764274, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.683591842651367 + }, + { + "auxiliary_loss_clip": 0.01119581, + "auxiliary_loss_mlp": 0.01105693, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00059807, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 1.7055022200313308, + "language_loss": 0.83077514, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85302794, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 2.666595220565796 + }, + { + "auxiliary_loss_clip": 0.01131689, + "auxiliary_loss_mlp": 0.01105756, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00056577, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.7125541051698268, + "language_loss": 0.67220789, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69458234, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.627573251724243 + }, + { + "auxiliary_loss_clip": 0.0115093, + "auxiliary_loss_mlp": 0.01106417, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00065446, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.8157918490605023, + "language_loss": 0.7494688, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77204221, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 3.94315242767334 + }, + { + "auxiliary_loss_clip": 0.01128126, + "auxiliary_loss_mlp": 0.01082033, + "balance_loss_clip": 1.00136507, + "balance_loss_mlp": 1.00001705, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8280966376351188, + "language_loss": 0.58724463, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60934621, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 4.439079761505127 + }, + { + "auxiliary_loss_clip": 0.01119012, + "auxiliary_loss_mlp": 0.01104093, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00052381, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 1.9305552422543315, + "language_loss": 0.68963403, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71186507, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.6378397941589355 + }, + { + "auxiliary_loss_clip": 0.01134321, + "auxiliary_loss_mlp": 0.01105398, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00058877, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 3.063709499951549, + "language_loss": 0.65147185, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67386913, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.6406164169311523 + }, + { + "auxiliary_loss_clip": 0.01165429, + "auxiliary_loss_mlp": 0.01104578, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00043726, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 1.9410551821107513, + "language_loss": 0.74069929, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76339936, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 2.53363299369812 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01105395, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00058591, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.5660760692405296, + "language_loss": 0.69635075, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71876645, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 2.586230754852295 + }, + { + "auxiliary_loss_clip": 0.01114934, + "auxiliary_loss_mlp": 0.01104048, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00066972, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.5166957046824396, + "language_loss": 0.77434492, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79653478, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 4.037017822265625 + }, + { + "auxiliary_loss_clip": 0.01148847, + "auxiliary_loss_mlp": 0.01104858, + "balance_loss_clip": 1.00186837, + "balance_loss_mlp": 1.00062108, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.984321919014622, + "language_loss": 0.79167742, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81421447, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.5263547897338867 + }, + { + "auxiliary_loss_clip": 0.01102808, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_clip": 1.00173235, + "balance_loss_mlp": 1.00056136, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.8248111376966762, + "language_loss": 0.720079, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74214745, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 2.670557975769043 + }, + { + "auxiliary_loss_clip": 0.01133867, + "auxiliary_loss_mlp": 0.01106276, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00070405, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.7221884247365942, + "language_loss": 0.71351635, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73591781, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.5988175868988037 + }, + { + "auxiliary_loss_clip": 0.0111943, + "auxiliary_loss_mlp": 0.01105451, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.0006423, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.5573983071474773, + "language_loss": 0.62224734, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64449614, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 2.7141165733337402 + }, + { + "auxiliary_loss_clip": 0.01149112, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.00173211, + "balance_loss_mlp": 1.00052893, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.8381676100750692, + "language_loss": 0.74557149, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76453757, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.6003785133361816 + }, + { + "auxiliary_loss_clip": 0.01149373, + "auxiliary_loss_mlp": 0.00747497, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.8346377759381005, + "language_loss": 0.74427789, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76324654, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 2.581566333770752 + }, + { + "auxiliary_loss_clip": 0.01117316, + "auxiliary_loss_mlp": 0.01104769, + "balance_loss_clip": 1.00178838, + "balance_loss_mlp": 1.00053263, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.150940513561719, + "language_loss": 0.72019345, + "learning_rate": 7.178921802463702e-07, + "loss": 0.7424143, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.598867654800415 + }, + { + "auxiliary_loss_clip": 0.01148765, + "auxiliary_loss_mlp": 0.01104449, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.00059342, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.5906285442603902, + "language_loss": 0.73515618, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75768834, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 2.6135661602020264 + }, + { + "auxiliary_loss_clip": 0.01132329, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_clip": 1.00202191, + "balance_loss_mlp": 1.0006249, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.963687756413007, + "language_loss": 0.55204308, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57442641, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 2.6664464473724365 + }, + { + "auxiliary_loss_clip": 0.01118919, + "auxiliary_loss_mlp": 0.01104516, + "balance_loss_clip": 1.00188482, + "balance_loss_mlp": 1.00047052, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.4605843292418783, + "language_loss": 0.7262069, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74844122, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 2.655304431915283 + }, + { + "auxiliary_loss_clip": 0.01165614, + "auxiliary_loss_mlp": 0.01104453, + "balance_loss_clip": 1.00196731, + "balance_loss_mlp": 1.00069284, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.8005287244587729, + "language_loss": 0.73775661, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76045728, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 2.5042200088500977 + }, + { + "auxiliary_loss_clip": 0.01117548, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.00161433, + "balance_loss_mlp": 1.00051546, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.2458362678917076, + "language_loss": 0.66501868, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68724644, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.6331470012664795 + }, + { + "auxiliary_loss_clip": 0.01132169, + "auxiliary_loss_mlp": 0.0110518, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00046706, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.6813561789330302, + "language_loss": 0.79124105, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81361461, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.5556480884552 + }, + { + "auxiliary_loss_clip": 0.01117568, + "auxiliary_loss_mlp": 0.01104048, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00057411, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.5899542616100442, + "language_loss": 0.91319668, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93541288, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 2.622776985168457 + }, + { + "auxiliary_loss_clip": 0.01165651, + "auxiliary_loss_mlp": 0.01104816, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.0005796, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.6936432920641804, + "language_loss": 0.6211924, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64389706, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 2.5454602241516113 + }, + { + "auxiliary_loss_clip": 0.01165714, + "auxiliary_loss_mlp": 0.01105348, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00053954, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.7688844870550255, + "language_loss": 0.75227261, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77498329, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 2.4859583377838135 + }, + { + "auxiliary_loss_clip": 0.01129033, + "auxiliary_loss_mlp": 0.00746391, + "balance_loss_clip": 1.00106621, + "balance_loss_mlp": 1.00076056, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.7176024092296518, + "language_loss": 0.5674932, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58624744, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.144552230834961 + }, + { + "auxiliary_loss_clip": 0.0113365, + "auxiliary_loss_mlp": 0.01105532, + "balance_loss_clip": 1.00177419, + "balance_loss_mlp": 1.00053263, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.5428774992581253, + "language_loss": 0.73811233, + "learning_rate": 7.146071116474451e-07, + "loss": 0.76050413, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.5736703872680664 + }, + { + "auxiliary_loss_clip": 0.0116567, + "auxiliary_loss_mlp": 0.01105022, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00049961, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 5.304220596192745, + "language_loss": 0.84037912, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86308599, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 2.512420654296875 + }, + { + "auxiliary_loss_clip": 0.01117766, + "auxiliary_loss_mlp": 0.01105511, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00070226, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 2.074272400009787, + "language_loss": 0.77662659, + "learning_rate": 7.14010459655127e-07, + "loss": 0.79885936, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.6685307025909424 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01105352, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00054336, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.5509385317002244, + "language_loss": 0.79409289, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81629896, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.670896053314209 + }, + { + "auxiliary_loss_clip": 0.01149045, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00050104, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.6423312157042174, + "language_loss": 0.67404187, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69659209, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.522394895553589 + }, + { + "auxiliary_loss_clip": 0.01099419, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.00046933, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.8459556641117936, + "language_loss": 0.66127163, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68332815, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 2.668607234954834 + }, + { + "auxiliary_loss_clip": 0.0113332, + "auxiliary_loss_mlp": 0.01104026, + "balance_loss_clip": 1.00166667, + "balance_loss_mlp": 1.00055218, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.622304971973331, + "language_loss": 0.81845731, + "learning_rate": 7.128177409391851e-07, + "loss": 0.8408308, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 4.037911653518677 + }, + { + "auxiliary_loss_clip": 0.01119061, + "auxiliary_loss_mlp": 0.01104071, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 1.923150794616964, + "language_loss": 0.75313276, + "learning_rate": 7.125196832571367e-07, + "loss": 0.7753641, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 2.6010615825653076 + }, + { + "auxiliary_loss_clip": 0.01148955, + "auxiliary_loss_mlp": 0.01104116, + "balance_loss_clip": 1.00184071, + "balance_loss_mlp": 1.00045156, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 3.980498462025156, + "language_loss": 0.72613406, + "learning_rate": 7.122216743964713e-07, + "loss": 0.74866474, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 2.521873712539673 + }, + { + "auxiliary_loss_clip": 0.01134253, + "auxiliary_loss_mlp": 0.01105965, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.00058353, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.51379681200397, + "language_loss": 0.85685468, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87925684, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 2.618225574493408 + }, + { + "auxiliary_loss_clip": 0.01133351, + "auxiliary_loss_mlp": 0.0110602, + "balance_loss_clip": 1.00183904, + "balance_loss_mlp": 1.00063884, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 1.9633406167767684, + "language_loss": 0.73597872, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75837243, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 2.5725178718566895 + }, + { + "auxiliary_loss_clip": 0.01149099, + "auxiliary_loss_mlp": 0.0110559, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00049484, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 2.3088801101837317, + "language_loss": 0.72718585, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74973273, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.516721487045288 + }, + { + "auxiliary_loss_clip": 0.01119146, + "auxiliary_loss_mlp": 0.00747481, + "balance_loss_clip": 1.0017879, + "balance_loss_mlp": 1.00046098, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 5.5886752815206036, + "language_loss": 0.69598818, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71465445, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.6844820976257324 + }, + { + "auxiliary_loss_clip": 0.01148616, + "auxiliary_loss_mlp": 0.01106216, + "balance_loss_clip": 1.00202739, + "balance_loss_mlp": 1.00045323, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.8620694696198048, + "language_loss": 0.66511136, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68765962, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 2.599597454071045 + }, + { + "auxiliary_loss_clip": 0.01132724, + "auxiliary_loss_mlp": 0.01104863, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.00053072, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.4386906392037548, + "language_loss": 0.68401009, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70638597, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 5.466645956039429 + }, + { + "auxiliary_loss_clip": 0.01100569, + "auxiliary_loss_mlp": 0.01104996, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00066388, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.463340915674424, + "language_loss": 0.72543252, + "learning_rate": 7.101369803195391e-07, + "loss": 0.74748826, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 2.66481351852417 + }, + { + "auxiliary_loss_clip": 0.01150872, + "auxiliary_loss_mlp": 0.01106339, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00057673, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.848703732232277, + "language_loss": 0.76623893, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78881109, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.5661203861236572 + }, + { + "auxiliary_loss_clip": 0.01131827, + "auxiliary_loss_mlp": 0.01104924, + "balance_loss_clip": 1.00206649, + "balance_loss_mlp": 1.0004971, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 2.035769968516887, + "language_loss": 0.79638243, + "learning_rate": 7.095417934766781e-07, + "loss": 0.8187499, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 2.5855109691619873 + }, + { + "auxiliary_loss_clip": 0.0114882, + "auxiliary_loss_mlp": 0.01104218, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00083983, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.5433357981207394, + "language_loss": 0.76875687, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79128718, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 2.5745856761932373 + }, + { + "auxiliary_loss_clip": 0.01150676, + "auxiliary_loss_mlp": 0.01105739, + "balance_loss_clip": 1.0018636, + "balance_loss_mlp": 1.00054896, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.437046210449988, + "language_loss": 0.81884527, + "learning_rate": 7.089468023710326e-07, + "loss": 0.84140944, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 3.9886527061462402 + }, + { + "auxiliary_loss_clip": 0.01149289, + "auxiliary_loss_mlp": 0.01105596, + "balance_loss_clip": 1.0018872, + "balance_loss_mlp": 1.00059688, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.8149375445908489, + "language_loss": 0.70213306, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72468191, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 2.6085097789764404 + }, + { + "auxiliary_loss_clip": 0.01165553, + "auxiliary_loss_mlp": 0.01104949, + "balance_loss_clip": 1.00194132, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.4136142793306887, + "language_loss": 0.69376642, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71647143, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 2.512577772140503 + }, + { + "auxiliary_loss_clip": 0.01165625, + "auxiliary_loss_mlp": 0.01104641, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.0006901, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.4930393864673028, + "language_loss": 0.65831012, + "learning_rate": 7.080546829172564e-07, + "loss": 0.68101275, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.595454216003418 + }, + { + "auxiliary_loss_clip": 0.01165716, + "auxiliary_loss_mlp": 0.0110602, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.00044787, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.5114690778420576, + "language_loss": 0.61218452, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63490188, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.4916844367980957 + }, + { + "auxiliary_loss_clip": 0.01101093, + "auxiliary_loss_mlp": 0.01104485, + "balance_loss_clip": 1.00161934, + "balance_loss_mlp": 1.00053442, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 2.286906872926608, + "language_loss": 0.73955458, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76161039, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 2.6405563354492188 + }, + { + "auxiliary_loss_clip": 0.0116571, + "auxiliary_loss_mlp": 0.01104815, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00048316, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.6261338045002303, + "language_loss": 0.80963206, + "learning_rate": 7.071630043797317e-07, + "loss": 0.83233726, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.579707145690918 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01104638, + "balance_loss_clip": 1.00184453, + "balance_loss_mlp": 1.00049615, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 1.8191979742612372, + "language_loss": 0.77084196, + "learning_rate": 7.068658762345488e-07, + "loss": 0.79322076, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.596895217895508 + }, + { + "auxiliary_loss_clip": 0.01148932, + "auxiliary_loss_mlp": 0.01105047, + "balance_loss_clip": 1.00199556, + "balance_loss_mlp": 1.00052476, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.4363380282086553, + "language_loss": 0.76624835, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78878814, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.564324378967285 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.0110432, + "balance_loss_clip": 1.00159931, + "balance_loss_mlp": 1.00055981, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.048744220393198, + "language_loss": 0.74261844, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76481336, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.590587854385376 + }, + { + "auxiliary_loss_clip": 0.01135599, + "auxiliary_loss_mlp": 0.01105502, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00059807, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 1.8041257491357143, + "language_loss": 0.82158136, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84399241, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 2.6314635276794434 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01104926, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 1.0005939, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 2.2283074604606234, + "language_loss": 0.74581653, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76818311, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.560161590576172 + }, + { + "auxiliary_loss_clip": 0.01148818, + "auxiliary_loss_mlp": 0.00747517, + "balance_loss_clip": 1.00173199, + "balance_loss_mlp": 1.00054932, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 2.3275635712645166, + "language_loss": 0.79450101, + "learning_rate": 7.053809712705396e-07, + "loss": 0.8134644, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 2.6410772800445557 + }, + { + "auxiliary_loss_clip": 0.01150539, + "auxiliary_loss_mlp": 0.0074745, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00053942, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 3.064614440543522, + "language_loss": 0.71823514, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73721504, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 2.544196367263794 + }, + { + "auxiliary_loss_clip": 0.01165796, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00057471, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.3774680048667325, + "language_loss": 0.71127021, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73398674, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.586942195892334 + }, + { + "auxiliary_loss_clip": 0.01149309, + "auxiliary_loss_mlp": 0.01106327, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.0006603, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 2.424926367766853, + "language_loss": 0.72658581, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74914217, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 2.532640218734741 + }, + { + "auxiliary_loss_clip": 0.01130104, + "auxiliary_loss_mlp": 0.0108205, + "balance_loss_clip": 1.00130343, + "balance_loss_mlp": 1.00003386, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.752770563253112, + "language_loss": 0.65220207, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67432362, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 3.1500649452209473 + }, + { + "auxiliary_loss_clip": 0.01165523, + "auxiliary_loss_mlp": 0.01105398, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00030279, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.7139504445647902, + "language_loss": 0.80106533, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82377446, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.514073610305786 + }, + { + "auxiliary_loss_clip": 0.0114662, + "auxiliary_loss_mlp": 0.01105453, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00054884, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.9277864711569555, + "language_loss": 0.73415208, + "learning_rate": 7.036007054761508e-07, + "loss": 0.7566728, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 2.543997049331665 + }, + { + "auxiliary_loss_clip": 0.01165694, + "auxiliary_loss_mlp": 0.0110636, + "balance_loss_clip": 1.00204408, + "balance_loss_mlp": 1.00059748, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.5688836585580421, + "language_loss": 0.88594079, + "learning_rate": 7.033041665033716e-07, + "loss": 0.90866137, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.519373655319214 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01105466, + "balance_loss_clip": 1.00180018, + "balance_loss_mlp": 1.00046623, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 2.6639265846165694, + "language_loss": 0.74995148, + "learning_rate": 7.030076767014284e-07, + "loss": 0.77204823, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.6778323650360107 + }, + { + "auxiliary_loss_clip": 0.01115875, + "auxiliary_loss_mlp": 0.01105235, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00042629, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.9465292694495386, + "language_loss": 0.8248921, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84710324, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.634186267852783 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.01106382, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00071454, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.7672127695469568, + "language_loss": 0.71703869, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73927915, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.6390557289123535 + }, + { + "auxiliary_loss_clip": 0.01165586, + "auxiliary_loss_mlp": 0.01105454, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.0004549, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5599635754879315, + "language_loss": 0.6931743, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71588469, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 2.5689351558685303 + }, + { + "auxiliary_loss_clip": 0.01148988, + "auxiliary_loss_mlp": 0.01105517, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00061297, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.5854398417260156, + "language_loss": 0.73090589, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75345105, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 2.5515050888061523 + }, + { + "auxiliary_loss_clip": 0.01146698, + "auxiliary_loss_mlp": 0.01105578, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.0005784, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 1.8772270458189615, + "language_loss": 0.76747143, + "learning_rate": 7.015259656476911e-07, + "loss": 0.78999418, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 4.017709016799927 + }, + { + "auxiliary_loss_clip": 0.01148935, + "auxiliary_loss_mlp": 0.01106103, + "balance_loss_clip": 1.00204599, + "balance_loss_mlp": 1.00043547, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.6151242173273657, + "language_loss": 0.70654118, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72909153, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 2.5443332195281982 + }, + { + "auxiliary_loss_clip": 0.01165567, + "auxiliary_loss_mlp": 0.01105375, + "balance_loss_clip": 1.00182235, + "balance_loss_mlp": 1.00066137, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.78572526722726, + "language_loss": 0.72338045, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74608982, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 2.483980894088745 + }, + { + "auxiliary_loss_clip": 0.01165558, + "auxiliary_loss_mlp": 0.01105464, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00046468, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.7141135038809243, + "language_loss": 0.71546865, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73817885, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.557016611099243 + }, + { + "auxiliary_loss_clip": 0.01085164, + "auxiliary_loss_mlp": 0.00747611, + "balance_loss_clip": 1.00158334, + "balance_loss_mlp": 1.00058818, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 2.0556667215311837, + "language_loss": 0.78289616, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80122387, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.680208921432495 + }, + { + "auxiliary_loss_clip": 0.01087021, + "auxiliary_loss_mlp": 0.01104232, + "balance_loss_clip": 1.00171244, + "balance_loss_mlp": 1.00047255, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.757569260257649, + "language_loss": 0.7408669, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76277941, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 2.783511161804199 + }, + { + "auxiliary_loss_clip": 0.0113434, + "auxiliary_loss_mlp": 0.01105647, + "balance_loss_clip": 1.00187719, + "balance_loss_mlp": 1.00055194, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.4138009034223646, + "language_loss": 0.76972115, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79212099, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.5550036430358887 + }, + { + "auxiliary_loss_clip": 0.01117318, + "auxiliary_loss_mlp": 0.01105794, + "balance_loss_clip": 1.00185096, + "balance_loss_mlp": 1.00069964, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 1.7079372891639422, + "language_loss": 0.61483192, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63706303, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.73370623588562 + }, + { + "auxiliary_loss_clip": 0.01119294, + "auxiliary_loss_mlp": 0.00747449, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00057256, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.6540567588414876, + "language_loss": 0.52004397, + "learning_rate": 6.991577889352264e-07, + "loss": 0.53871137, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 5.652894496917725 + }, + { + "auxiliary_loss_clip": 0.01134278, + "auxiliary_loss_mlp": 0.01105186, + "balance_loss_clip": 1.00181377, + "balance_loss_mlp": 1.00037718, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.7244853673916671, + "language_loss": 0.68584412, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70823878, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.591707706451416 + }, + { + "auxiliary_loss_clip": 0.01132763, + "auxiliary_loss_mlp": 0.0110683, + "balance_loss_clip": 1.00197875, + "balance_loss_mlp": 1.0005908, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.607436851964792, + "language_loss": 0.66267353, + "learning_rate": 6.985662378133474e-07, + "loss": 0.6850695, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 2.6290764808654785 + }, + { + "auxiliary_loss_clip": 0.01131977, + "auxiliary_loss_mlp": 0.01104774, + "balance_loss_clip": 1.00187016, + "balance_loss_mlp": 1.000633, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 1.8992738373690377, + "language_loss": 0.77130765, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79367518, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.635385751724243 + }, + { + "auxiliary_loss_clip": 0.01098554, + "auxiliary_loss_mlp": 0.01105066, + "balance_loss_clip": 1.00192714, + "balance_loss_mlp": 1.00054288, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.57307156985865, + "language_loss": 0.79933399, + "learning_rate": 6.979748840934601e-07, + "loss": 0.82137024, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.6976640224456787 + }, + { + "auxiliary_loss_clip": 0.01117038, + "auxiliary_loss_mlp": 0.01104502, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00055146, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.8064933573362025, + "language_loss": 0.71566498, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73788041, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 4.20776629447937 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01082024, + "balance_loss_clip": 1.00130439, + "balance_loss_mlp": 1.00000846, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7756992434564172, + "language_loss": 0.54775846, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56987977, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 3.236525535583496 + }, + { + "auxiliary_loss_clip": 0.01165539, + "auxiliary_loss_mlp": 0.01104887, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00046003, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.354945066013316, + "language_loss": 0.80238259, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82508683, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 2.5533175468444824 + }, + { + "auxiliary_loss_clip": 0.01165498, + "auxiliary_loss_mlp": 0.01104638, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00040185, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.4487881255225783, + "language_loss": 0.79007995, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81278133, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 2.534489870071411 + }, + { + "auxiliary_loss_clip": 0.01165464, + "auxiliary_loss_mlp": 0.01105024, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.00040555, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 5.115728478235809, + "language_loss": 0.76527834, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78798324, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 2.498903512954712 + }, + { + "auxiliary_loss_clip": 0.01133939, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.00051761, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.7924520910338728, + "language_loss": 0.71959686, + "learning_rate": 6.962020082425748e-07, + "loss": 0.74199045, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.627804756164551 + }, + { + "auxiliary_loss_clip": 0.01165698, + "auxiliary_loss_mlp": 0.01104506, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00055552, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.781362215875918, + "language_loss": 0.68812776, + "learning_rate": 6.959067019092766e-07, + "loss": 0.71082979, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 2.549880027770996 + }, + { + "auxiliary_loss_clip": 0.01161198, + "auxiliary_loss_mlp": 0.01081596, + "balance_loss_clip": 1.00136256, + "balance_loss_mlp": 0.99996156, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7267130808340023, + "language_loss": 0.5435546, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56598258, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 3.0071890354156494 + }, + { + "auxiliary_loss_clip": 0.01165747, + "auxiliary_loss_mlp": 0.01105216, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 2.1213843070591567, + "language_loss": 0.70270699, + "learning_rate": 6.953162376079233e-07, + "loss": 0.7254166, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 2.494845390319824 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.01104689, + "balance_loss_clip": 1.00198853, + "balance_loss_mlp": 1.00064313, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 5.0307119168990875, + "language_loss": 0.72941595, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75181007, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.5991218090057373 + }, + { + "auxiliary_loss_clip": 0.01165744, + "auxiliary_loss_mlp": 0.01106835, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00059521, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 2.5671597916418145, + "language_loss": 0.7811482, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80387396, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.573568344116211 + }, + { + "auxiliary_loss_clip": 0.01115561, + "auxiliary_loss_mlp": 0.01104012, + "balance_loss_clip": 1.00164485, + "balance_loss_mlp": 1.00053835, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 1.903083377255904, + "language_loss": 0.7810806, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80327636, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 2.6129379272460938 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.01104824, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00049245, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 1.9871274662723688, + "language_loss": 0.72121704, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74330187, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.680386543273926 + }, + { + "auxiliary_loss_clip": 0.01135999, + "auxiliary_loss_mlp": 0.01104325, + "balance_loss_clip": 1.00180376, + "balance_loss_mlp": 1.00056553, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.6729042777806789, + "language_loss": 0.74892092, + "learning_rate": 6.938409428408061e-07, + "loss": 0.7713241, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 2.6588213443756104 + }, + { + "auxiliary_loss_clip": 0.01150874, + "auxiliary_loss_mlp": 0.01105249, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.0004406, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 2.0485562626254716, + "language_loss": 0.65972811, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68228936, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 2.566913604736328 + }, + { + "auxiliary_loss_clip": 0.01135429, + "auxiliary_loss_mlp": 0.011051, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00048172, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.6745059217130782, + "language_loss": 0.69099116, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71339649, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 2.620033025741577 + }, + { + "auxiliary_loss_clip": 0.01100082, + "auxiliary_loss_mlp": 0.01104314, + "balance_loss_clip": 1.00174105, + "balance_loss_mlp": 1.00036335, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.498303772640081, + "language_loss": 0.6589843, + "learning_rate": 6.92956360247217e-07, + "loss": 0.68102825, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.7066965103149414 + }, + { + "auxiliary_loss_clip": 0.01150958, + "auxiliary_loss_mlp": 0.01104616, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00047445, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.7635286018921537, + "language_loss": 0.7154665, + "learning_rate": 6.926615984942332e-07, + "loss": 0.73802221, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 2.534292697906494 + }, + { + "auxiliary_loss_clip": 0.01117303, + "auxiliary_loss_mlp": 0.01104662, + "balance_loss_clip": 1.00176489, + "balance_loss_mlp": 1.00052094, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.6142234754394678, + "language_loss": 0.72418529, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74640489, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.7034294605255127 + }, + { + "auxiliary_loss_clip": 0.01165658, + "auxiliary_loss_mlp": 0.01105949, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00056744, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.7327481115154288, + "language_loss": 0.75951725, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78223336, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 2.5164365768432617 + }, + { + "auxiliary_loss_clip": 0.01133972, + "auxiliary_loss_mlp": 0.01104356, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00040519, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.4318997696288749, + "language_loss": 0.66881335, + "learning_rate": 6.917776107264008e-07, + "loss": 0.69119668, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.6307456493377686 + }, + { + "auxiliary_loss_clip": 0.0115048, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.0006454, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.7867516239137378, + "language_loss": 0.63661933, + "learning_rate": 6.914830473380749e-07, + "loss": 0.6591835, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.5807788372039795 + }, + { + "auxiliary_loss_clip": 0.01134377, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_clip": 1.00192511, + "balance_loss_mlp": 1.00051689, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 1.4872913823675251, + "language_loss": 0.63558328, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65796983, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.582598924636841 + }, + { + "auxiliary_loss_clip": 0.01132577, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00060177, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.8222891439966973, + "language_loss": 0.73497832, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75736487, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 2.6313982009887695 + }, + { + "auxiliary_loss_clip": 0.01085463, + "auxiliary_loss_mlp": 0.01105091, + "balance_loss_clip": 1.00166571, + "balance_loss_mlp": 1.00047302, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 2.1206896290932056, + "language_loss": 0.72361642, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74552196, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.691964864730835 + }, + { + "auxiliary_loss_clip": 0.01148998, + "auxiliary_loss_mlp": 0.01106573, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00061965, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 3.401073450681126, + "language_loss": 0.64453781, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66709352, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 4.057724237442017 + }, + { + "auxiliary_loss_clip": 0.01134223, + "auxiliary_loss_mlp": 0.01105445, + "balance_loss_clip": 1.00185835, + "balance_loss_mlp": 1.00054073, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.862392021840187, + "language_loss": 0.74687362, + "learning_rate": 6.900109749061874e-07, + "loss": 0.7692703, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 2.5653059482574463 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.0110535, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00054121, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.5589678320332416, + "language_loss": 0.73438251, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75709212, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.517617702484131 + }, + { + "auxiliary_loss_clip": 0.01150875, + "auxiliary_loss_mlp": 0.01106082, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00050986, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 1.9473479939380596, + "language_loss": 0.60258389, + "learning_rate": 6.894224935797017e-07, + "loss": 0.62515342, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.619685649871826 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01104604, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00046277, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.000607570719027, + "language_loss": 0.85944992, + "learning_rate": 6.891283274567259e-07, + "loss": 0.88184184, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 2.5780229568481445 + }, + { + "auxiliary_loss_clip": 0.01148976, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00064313, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.6555764115936782, + "language_loss": 0.69070011, + "learning_rate": 6.888342110421364e-07, + "loss": 0.70966578, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.7124080657958984 + }, + { + "auxiliary_loss_clip": 0.01054117, + "auxiliary_loss_mlp": 0.01104626, + "balance_loss_clip": 1.00148916, + "balance_loss_mlp": 1.00048435, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.5371849177533743, + "language_loss": 0.71941853, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74100602, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 2.890500545501709 + }, + { + "auxiliary_loss_clip": 0.01118947, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00049651, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.7223546367629692, + "language_loss": 0.72504938, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74730241, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 4.484215021133423 + }, + { + "auxiliary_loss_clip": 0.0112956, + "auxiliary_loss_mlp": 0.01103957, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00057888, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.29394749613181, + "language_loss": 0.79081541, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81315053, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 4.807214260101318 + }, + { + "auxiliary_loss_clip": 0.01150403, + "auxiliary_loss_mlp": 0.01104857, + "balance_loss_clip": 1.00193155, + "balance_loss_mlp": 1.00071549, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 2.11591206477584, + "language_loss": 0.82532358, + "learning_rate": 6.876582426906565e-07, + "loss": 0.84787619, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 3.4139633178710938 + }, + { + "auxiliary_loss_clip": 0.01150209, + "auxiliary_loss_mlp": 0.01104223, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00036824, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.7258995109260638, + "language_loss": 0.79406834, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81661266, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 3.9406800270080566 + }, + { + "auxiliary_loss_clip": 0.01102348, + "auxiliary_loss_mlp": 0.01104398, + "balance_loss_clip": 1.00173938, + "balance_loss_mlp": 1.00044727, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 4.765766059320475, + "language_loss": 0.79245001, + "learning_rate": 6.870705570551145e-07, + "loss": 0.8145175, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 3.190777540206909 + }, + { + "auxiliary_loss_clip": 0.01151016, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00059557, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.865933004192571, + "language_loss": 0.73958814, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76215905, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 3.3275511264801025 + }, + { + "auxiliary_loss_clip": 0.01151148, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.00045872, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.538599065312778, + "language_loss": 0.6977486, + "learning_rate": 6.864830705652347e-07, + "loss": 0.7203061, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 4.918547630310059 + }, + { + "auxiliary_loss_clip": 0.01117067, + "auxiliary_loss_mlp": 0.01104891, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00046337, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.5337747574136624, + "language_loss": 0.73587513, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75809473, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 3.0173280239105225 + }, + { + "auxiliary_loss_clip": 0.01133373, + "auxiliary_loss_mlp": 0.01103708, + "balance_loss_clip": 1.00170672, + "balance_loss_mlp": 1.00042462, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.0704569215644573, + "language_loss": 0.73542595, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75779676, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 2.922027587890625 + }, + { + "auxiliary_loss_clip": 0.01148813, + "auxiliary_loss_mlp": 0.01104495, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00054407, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.5295499586681887, + "language_loss": 0.73977238, + "learning_rate": 6.856022144234526e-07, + "loss": 0.7623055, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 2.9075169563293457 + }, + { + "auxiliary_loss_clip": 0.01133761, + "auxiliary_loss_mlp": 0.01105965, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.000489, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 1.7376140808980476, + "language_loss": 0.72843397, + "learning_rate": 6.853086953788727e-07, + "loss": 0.75083125, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 2.7438735961914062 + }, + { + "auxiliary_loss_clip": 0.01134635, + "auxiliary_loss_mlp": 0.01105551, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00045633, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.758030592272944, + "language_loss": 0.76709485, + "learning_rate": 6.850152261875189e-07, + "loss": 0.78949666, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 2.6416738033294678 + }, + { + "auxiliary_loss_clip": 0.01100871, + "auxiliary_loss_mlp": 0.0110538, + "balance_loss_clip": 1.00165367, + "balance_loss_mlp": 1.00047553, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.5595267056021225, + "language_loss": 0.71570796, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73777044, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 4.0128984451293945 + }, + { + "auxiliary_loss_clip": 0.01149101, + "auxiliary_loss_mlp": 0.01104978, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.00055051, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.7060352616377499, + "language_loss": 0.65339327, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67593402, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 2.9053359031677246 + }, + { + "auxiliary_loss_clip": 0.01105029, + "auxiliary_loss_mlp": 0.01106315, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00064778, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.8780995661964273, + "language_loss": 0.79253578, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81464922, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.713571310043335 + }, + { + "auxiliary_loss_clip": 0.01165559, + "auxiliary_loss_mlp": 0.00747466, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00054419, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.119440918969471, + "language_loss": 0.76139933, + "learning_rate": 6.83841848176905e-07, + "loss": 0.78052962, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.5254979133605957 + }, + { + "auxiliary_loss_clip": 0.01133938, + "auxiliary_loss_mlp": 0.01105485, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.00058091, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.4467468142787245, + "language_loss": 0.69275343, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71514761, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.5718185901641846 + }, + { + "auxiliary_loss_clip": 0.01148986, + "auxiliary_loss_mlp": 0.01106181, + "balance_loss_clip": 1.00193131, + "balance_loss_mlp": 1.00051403, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 5.983919869507663, + "language_loss": 0.75383699, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77638865, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.603938341140747 + }, + { + "auxiliary_loss_clip": 0.01148923, + "auxiliary_loss_mlp": 0.01104717, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00048041, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.7892002436753975, + "language_loss": 0.73969281, + "learning_rate": 6.829623386729182e-07, + "loss": 0.7622292, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.7275002002716064 + }, + { + "auxiliary_loss_clip": 0.01150962, + "auxiliary_loss_mlp": 0.01105219, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00060141, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.4602473084999972, + "language_loss": 0.77821422, + "learning_rate": 6.826692687078362e-07, + "loss": 0.800776, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 2.5707075595855713 + }, + { + "auxiliary_loss_clip": 0.01148948, + "auxiliary_loss_mlp": 0.01104898, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.000471, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.412272001993305, + "language_loss": 0.65926665, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68180513, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.6000146865844727 + }, + { + "auxiliary_loss_clip": 0.01149032, + "auxiliary_loss_mlp": 0.01105302, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00049305, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 3.6365896828222306, + "language_loss": 0.73441958, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75696295, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 2.6200783252716064 + }, + { + "auxiliary_loss_clip": 0.01151023, + "auxiliary_loss_mlp": 0.01105707, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00061178, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.9506273286124722, + "language_loss": 0.73456794, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75713521, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.6028759479522705 + }, + { + "auxiliary_loss_clip": 0.01134195, + "auxiliary_loss_mlp": 0.01106459, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00060058, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 1.8882213546067113, + "language_loss": 0.6728611, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69526768, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 2.6325135231018066 + }, + { + "auxiliary_loss_clip": 0.01165551, + "auxiliary_loss_mlp": 0.01105118, + "balance_loss_clip": 1.00190127, + "balance_loss_mlp": 1.00049973, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 9.819137963884543, + "language_loss": 0.88693821, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90964484, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 2.51910662651062 + }, + { + "auxiliary_loss_clip": 0.01165291, + "auxiliary_loss_mlp": 0.01103389, + "balance_loss_clip": 1.00194657, + "balance_loss_mlp": 1.0004878, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.5164755125758083, + "language_loss": 0.67167908, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69436586, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.6067028045654297 + }, + { + "auxiliary_loss_clip": 0.01165489, + "auxiliary_loss_mlp": 0.01103718, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00062609, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 1.6497135647135253, + "language_loss": 0.79989713, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82258928, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.622150182723999 + }, + { + "auxiliary_loss_clip": 0.0115081, + "auxiliary_loss_mlp": 0.01105881, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00040483, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.637466437107295, + "language_loss": 0.74390543, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76647234, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 2.6476197242736816 + }, + { + "auxiliary_loss_clip": 0.01149203, + "auxiliary_loss_mlp": 0.01105742, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.00055206, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.741464256628954, + "language_loss": 0.73651123, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75906068, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.6491332054138184 + }, + { + "auxiliary_loss_clip": 0.0111493, + "auxiliary_loss_mlp": 0.01104087, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00051785, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 2.1561881513905843, + "language_loss": 0.83176053, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85395074, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 2.648735284805298 + }, + { + "auxiliary_loss_clip": 0.01165542, + "auxiliary_loss_mlp": 0.01105424, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00071084, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.9559860367793254, + "language_loss": 0.73215532, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75486499, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.591758966445923 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.01106416, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.0005585, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 1.900688560639912, + "language_loss": 0.69981873, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72224659, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 5.170344352722168 + }, + { + "auxiliary_loss_clip": 0.01150778, + "auxiliary_loss_mlp": 0.0110514, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00061727, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.6445099056654868, + "language_loss": 0.69427156, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71683073, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 3.394679307937622 + }, + { + "auxiliary_loss_clip": 0.01130328, + "auxiliary_loss_mlp": 0.01105351, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00054264, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 2.1897242416420224, + "language_loss": 0.68036926, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70272601, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 3.3046681880950928 + }, + { + "auxiliary_loss_clip": 0.01131617, + "auxiliary_loss_mlp": 0.01104042, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00047278, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 2.170942979744144, + "language_loss": 0.78013217, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80248874, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 3.0064196586608887 + }, + { + "auxiliary_loss_clip": 0.01165391, + "auxiliary_loss_mlp": 0.01105078, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00055492, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 2.051152366392299, + "language_loss": 0.83562922, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85833389, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.6378211975097656 + }, + { + "auxiliary_loss_clip": 0.01132604, + "auxiliary_loss_mlp": 0.00747677, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.00070024, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.9387305808397093, + "language_loss": 0.73563462, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75443745, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 2.742671489715576 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.01105805, + "balance_loss_clip": 1.00174832, + "balance_loss_mlp": 1.00061464, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.6330172123068125, + "language_loss": 0.73025304, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75248325, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.7038588523864746 + }, + { + "auxiliary_loss_clip": 0.01165638, + "auxiliary_loss_mlp": 0.01104843, + "balance_loss_clip": 1.00194538, + "balance_loss_mlp": 1.0004158, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.219649555313791, + "language_loss": 0.76993132, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79263616, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 8.589046478271484 + }, + { + "auxiliary_loss_clip": 0.01165506, + "auxiliary_loss_mlp": 0.01105314, + "balance_loss_clip": 1.00199449, + "balance_loss_mlp": 1.00069594, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.677837923459664, + "language_loss": 0.78533518, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80804336, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 3.943450450897217 + }, + { + "auxiliary_loss_clip": 0.01148821, + "auxiliary_loss_mlp": 0.00747491, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00057673, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.0153098641730414, + "language_loss": 0.72304833, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74201143, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 2.54937481880188 + }, + { + "auxiliary_loss_clip": 0.01150028, + "auxiliary_loss_mlp": 0.01104929, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00050223, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.381230392720539, + "language_loss": 0.8554529, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87800246, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 2.5686984062194824 + }, + { + "auxiliary_loss_clip": 0.01117626, + "auxiliary_loss_mlp": 0.01105398, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00058877, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.694034647540584, + "language_loss": 0.72568047, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74791074, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.6762218475341797 + }, + { + "auxiliary_loss_clip": 0.01102288, + "auxiliary_loss_mlp": 0.01105507, + "balance_loss_clip": 1.00179851, + "balance_loss_mlp": 1.00050783, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.5571626021297598, + "language_loss": 0.61081421, + "learning_rate": 6.756506010719711e-07, + "loss": 0.63289213, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 4.177778959274292 + }, + { + "auxiliary_loss_clip": 0.01115353, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_clip": 1.00176251, + "balance_loss_mlp": 1.00047874, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 1.9366474544738874, + "language_loss": 0.68233418, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70454627, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.733747959136963 + }, + { + "auxiliary_loss_clip": 0.01165562, + "auxiliary_loss_mlp": 0.00747484, + "balance_loss_clip": 1.00202024, + "balance_loss_mlp": 1.0005126, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.647588627295743, + "language_loss": 0.76241565, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78154612, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.6952974796295166 + }, + { + "auxiliary_loss_clip": 0.01150994, + "auxiliary_loss_mlp": 0.0110549, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00058544, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.7270478818350246, + "language_loss": 0.69177222, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71433699, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 2.5840201377868652 + }, + { + "auxiliary_loss_clip": 0.0113331, + "auxiliary_loss_mlp": 0.01106104, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00043702, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1.8076828848157989, + "language_loss": 0.79819542, + "learning_rate": 6.744836312865602e-07, + "loss": 0.82058954, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 2.685269594192505 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01104589, + "balance_loss_clip": 1.00155818, + "balance_loss_mlp": 1.00044799, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 1.9274677362168093, + "language_loss": 0.65353251, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67558479, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.7041447162628174 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01104617, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00038004, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.8786767965849882, + "language_loss": 0.76634008, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78872108, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.6827328205108643 + }, + { + "auxiliary_loss_clip": 0.01149012, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.00199783, + "balance_loss_mlp": 1.00060296, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 2.2160072351527482, + "language_loss": 0.57843196, + "learning_rate": 6.736089316777684e-07, + "loss": 0.59739858, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 2.80330228805542 + }, + { + "auxiliary_loss_clip": 0.01161154, + "auxiliary_loss_mlp": 0.00746111, + "balance_loss_clip": 1.00134683, + "balance_loss_mlp": 1.00056803, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6372201107519331, + "language_loss": 0.4929232, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51199585, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 4.116251230239868 + }, + { + "auxiliary_loss_clip": 0.01147064, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_clip": 1.00203085, + "balance_loss_mlp": 1.00057125, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 2.9572815087947983, + "language_loss": 0.67194551, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69447762, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 3.591409206390381 + }, + { + "auxiliary_loss_clip": 0.01096301, + "auxiliary_loss_mlp": 0.01081264, + "balance_loss_clip": 1.00115311, + "balance_loss_mlp": 1.00001049, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9820962501028841, + "language_loss": 0.6079942, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6297698, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 3.4196698665618896 + }, + { + "auxiliary_loss_clip": 0.01102077, + "auxiliary_loss_mlp": 0.01104902, + "balance_loss_clip": 1.00185716, + "balance_loss_mlp": 1.00047469, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 4.495338745679776, + "language_loss": 0.67835397, + "learning_rate": 6.724433697406191e-07, + "loss": 0.70042378, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.96716570854187 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.01106155, + "balance_loss_clip": 1.00196993, + "balance_loss_mlp": 1.00048733, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.857469962101474, + "language_loss": 0.83490741, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85747218, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.651507616043091 + }, + { + "auxiliary_loss_clip": 0.01117152, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_clip": 1.00171101, + "balance_loss_mlp": 1.00047517, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.5812685806388662, + "language_loss": 0.73061848, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75283521, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.7443413734436035 + }, + { + "auxiliary_loss_clip": 0.01149869, + "auxiliary_loss_mlp": 0.01104077, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.0006988, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.7020130722112865, + "language_loss": 0.79256576, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8151052, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 11.115556478500366 + }, + { + "auxiliary_loss_clip": 0.01165493, + "auxiliary_loss_mlp": 0.01104461, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.8618970621836024, + "language_loss": 0.67283344, + "learning_rate": 6.712786132607182e-07, + "loss": 0.69553292, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 2.688244342803955 + }, + { + "auxiliary_loss_clip": 0.01134129, + "auxiliary_loss_mlp": 0.01104896, + "balance_loss_clip": 1.00169444, + "balance_loss_mlp": 1.00056398, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 2.19619935484248, + "language_loss": 0.68833339, + "learning_rate": 6.709875500762645e-07, + "loss": 0.71072364, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.632490396499634 + }, + { + "auxiliary_loss_clip": 0.01131573, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00051045, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.8491656854176841, + "language_loss": 0.74540377, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76777077, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 3.31697416305542 + }, + { + "auxiliary_loss_clip": 0.01130583, + "auxiliary_loss_mlp": 0.01082909, + "balance_loss_clip": 1.00189388, + "balance_loss_mlp": 1.00013018, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7212910081658882, + "language_loss": 0.60879743, + "learning_rate": 6.704055749072455e-07, + "loss": 0.63093233, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 4.355502128601074 + }, + { + "auxiliary_loss_clip": 0.01130217, + "auxiliary_loss_mlp": 0.01105191, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00057268, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.6019956909604183, + "language_loss": 0.8035962, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82595026, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 3.4784817695617676 + }, + { + "auxiliary_loss_clip": 0.01165474, + "auxiliary_loss_mlp": 0.0110418, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00042021, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.8608826943596881, + "language_loss": 0.72992754, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75262409, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 4.2313618659973145 + }, + { + "auxiliary_loss_clip": 0.01165661, + "auxiliary_loss_mlp": 0.01105599, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00069511, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 2.2824170237741694, + "language_loss": 0.73849446, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76120704, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 4.955905199050903 + }, + { + "auxiliary_loss_clip": 0.01165361, + "auxiliary_loss_mlp": 0.01104238, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00038338, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.6134325738698938, + "language_loss": 0.54016566, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56286168, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 3.734922409057617 + }, + { + "auxiliary_loss_clip": 0.01133938, + "auxiliary_loss_mlp": 0.01104677, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00063062, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.7257742935592368, + "language_loss": 0.83823943, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86062562, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 3.8645260334014893 + }, + { + "auxiliary_loss_clip": 0.01130571, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.00166893, + "balance_loss_mlp": 0.9999938, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8844124766931533, + "language_loss": 0.5769757, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59909767, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.837409019470215 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01104936, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00050867, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 2.1790119596775566, + "language_loss": 0.81762081, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83999133, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 8.256426572799683 + }, + { + "auxiliary_loss_clip": 0.01148929, + "auxiliary_loss_mlp": 0.01104487, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00053632, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7120307821822536, + "language_loss": 0.69857979, + "learning_rate": 6.680796918475893e-07, + "loss": 0.72111392, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 3.5379979610443115 + }, + { + "auxiliary_loss_clip": 0.01129479, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.00050259, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.651345925151968, + "language_loss": 0.81166875, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83400238, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 3.8303873538970947 + }, + { + "auxiliary_loss_clip": 0.011488, + "auxiliary_loss_mlp": 0.01105021, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00049877, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 2.133725985462857, + "language_loss": 0.72476602, + "learning_rate": 6.674987259277692e-07, + "loss": 0.7473042, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 3.3853821754455566 + }, + { + "auxiliary_loss_clip": 0.01118595, + "auxiliary_loss_mlp": 0.01105717, + "balance_loss_clip": 1.00201356, + "balance_loss_mlp": 1.00052667, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.7891726625637423, + "language_loss": 0.88344169, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90568483, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 5.859495162963867 + }, + { + "auxiliary_loss_clip": 0.01084118, + "auxiliary_loss_mlp": 0.01104502, + "balance_loss_clip": 1.00143731, + "balance_loss_mlp": 1.00045574, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.6266603887961824, + "language_loss": 0.80267596, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82456213, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 3.8874671459198 + }, + { + "auxiliary_loss_clip": 0.01104622, + "auxiliary_loss_mlp": 0.01104812, + "balance_loss_clip": 1.00186312, + "balance_loss_mlp": 1.00057518, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.8061828391955161, + "language_loss": 0.78003478, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80212903, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 4.196818113327026 + }, + { + "auxiliary_loss_clip": 0.0109657, + "auxiliary_loss_mlp": 0.01105132, + "balance_loss_clip": 1.00163841, + "balance_loss_mlp": 1.0006094, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 1.7959521222207078, + "language_loss": 0.78783333, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80985034, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 3.430112600326538 + }, + { + "auxiliary_loss_clip": 0.01144099, + "auxiliary_loss_mlp": 0.01081621, + "balance_loss_clip": 1.00135565, + "balance_loss_mlp": 0.99998635, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.843444804738549, + "language_loss": 0.55177581, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57403302, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 9.998960494995117 + }, + { + "auxiliary_loss_clip": 0.01148699, + "auxiliary_loss_mlp": 0.01105153, + "balance_loss_clip": 1.00202072, + "balance_loss_mlp": 1.00053525, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.4212950000872824, + "language_loss": 0.79577124, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81830972, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 3.784567356109619 + }, + { + "auxiliary_loss_clip": 0.01148668, + "auxiliary_loss_mlp": 0.01104538, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00049233, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.7216417186741433, + "language_loss": 0.75204015, + "learning_rate": 6.654669374367275e-07, + "loss": 0.77457225, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 3.9124433994293213 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01103855, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00047684, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.7372679862786982, + "language_loss": 0.81752062, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83985794, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 3.4737017154693604 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01105437, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.00043762, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 1.9196956944194088, + "language_loss": 0.76448822, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78687835, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 11.607338905334473 + }, + { + "auxiliary_loss_clip": 0.01131843, + "auxiliary_loss_mlp": 0.01104436, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00058138, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 1.9364535289546785, + "language_loss": 0.64183891, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66420174, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 3.9355509281158447 + }, + { + "auxiliary_loss_clip": 0.01148935, + "auxiliary_loss_mlp": 0.01106919, + "balance_loss_clip": 1.00196517, + "balance_loss_mlp": 1.00058389, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.773850517898171, + "language_loss": 0.8274318, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84999037, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 3.5544841289520264 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01106992, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00075316, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 2.001635364785333, + "language_loss": 0.72127724, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74366903, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 3.429988384246826 + }, + { + "auxiliary_loss_clip": 0.01150438, + "auxiliary_loss_mlp": 0.00747483, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.00061369, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.8110423983235622, + "language_loss": 0.6388731, + "learning_rate": 6.637273779206183e-07, + "loss": 0.65785229, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 7.374166965484619 + }, + { + "auxiliary_loss_clip": 0.01116951, + "auxiliary_loss_mlp": 0.01104817, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00048494, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.3313853733734973, + "language_loss": 0.76136637, + "learning_rate": 6.634376286210559e-07, + "loss": 0.783584, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 4.313975095748901 + }, + { + "auxiliary_loss_clip": 0.01132476, + "auxiliary_loss_mlp": 0.01104839, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.00031614, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.8330222798968938, + "language_loss": 0.74491227, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76728541, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 4.141408443450928 + }, + { + "auxiliary_loss_clip": 0.01100311, + "auxiliary_loss_mlp": 0.01105292, + "balance_loss_clip": 1.00168061, + "balance_loss_mlp": 1.00048327, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.728002743645465, + "language_loss": 0.68289906, + "learning_rate": 6.628582820806545e-07, + "loss": 0.7049551, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 3.5671751499176025 + }, + { + "auxiliary_loss_clip": 0.01115296, + "auxiliary_loss_mlp": 0.01104849, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00051737, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.6266699787774443, + "language_loss": 0.89619374, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91839516, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 5.266681432723999 + }, + { + "auxiliary_loss_clip": 0.01165491, + "auxiliary_loss_mlp": 0.0110525, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00053668, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.776566811154572, + "language_loss": 0.85682595, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87953335, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 4.7535881996154785 + }, + { + "auxiliary_loss_clip": 0.01148787, + "auxiliary_loss_mlp": 0.01104213, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00045371, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.6755774289861491, + "language_loss": 0.66678023, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68931025, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 3.8430519104003906 + }, + { + "auxiliary_loss_clip": 0.01117613, + "auxiliary_loss_mlp": 0.01106388, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00043511, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.6500067778226777, + "language_loss": 0.66796744, + "learning_rate": 6.617001975422647e-07, + "loss": 0.69020748, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 3.760739803314209 + }, + { + "auxiliary_loss_clip": 0.01118228, + "auxiliary_loss_mlp": 0.01106586, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00044239, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 1.8830581775485002, + "language_loss": 0.85462475, + "learning_rate": 6.614108032513823e-07, + "loss": 0.8768729, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 3.541934013366699 + }, + { + "auxiliary_loss_clip": 0.01068961, + "auxiliary_loss_mlp": 0.01104976, + "balance_loss_clip": 1.00167322, + "balance_loss_mlp": 1.00045323, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 1.7394043141985736, + "language_loss": 0.69474852, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71648788, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 7.410583019256592 + }, + { + "auxiliary_loss_clip": 0.01165659, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00064945, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 2.3501626552637016, + "language_loss": 0.63437498, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65707755, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 3.5173134803771973 + }, + { + "auxiliary_loss_clip": 0.01132408, + "auxiliary_loss_mlp": 0.01104175, + "balance_loss_clip": 1.00181854, + "balance_loss_mlp": 1.0006063, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 3.4005341804469738, + "language_loss": 0.71332967, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73569548, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 4.028907060623169 + }, + { + "auxiliary_loss_clip": 0.01113193, + "auxiliary_loss_mlp": 0.01105209, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00040007, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.6152472187655575, + "language_loss": 0.82354224, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84572631, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 3.3700475692749023 + }, + { + "auxiliary_loss_clip": 0.01165554, + "auxiliary_loss_mlp": 0.01105974, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00059295, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.3457737115617427, + "language_loss": 0.75155282, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77426809, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 5.496667861938477 + }, + { + "auxiliary_loss_clip": 0.01102568, + "auxiliary_loss_mlp": 0.01104872, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00044417, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.7082767684000069, + "language_loss": 0.73532557, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75739998, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 4.699608325958252 + }, + { + "auxiliary_loss_clip": 0.01134283, + "auxiliary_loss_mlp": 0.01104875, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00054288, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.6753040527546934, + "language_loss": 0.76668406, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78907561, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 3.9548592567443848 + }, + { + "auxiliary_loss_clip": 0.01150559, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00045681, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7499743016973324, + "language_loss": 0.73134041, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75388908, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 3.592081308364868 + }, + { + "auxiliary_loss_clip": 0.01135138, + "auxiliary_loss_mlp": 0.01104887, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00046003, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.6430810438590715, + "language_loss": 0.79547608, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81787622, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 6.203531742095947 + }, + { + "auxiliary_loss_clip": 0.01114853, + "auxiliary_loss_mlp": 0.01105248, + "balance_loss_clip": 1.00177896, + "balance_loss_mlp": 1.00053477, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.408191274747309, + "language_loss": 0.75608152, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77828252, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 5.919054746627808 + }, + { + "auxiliary_loss_clip": 0.01133678, + "auxiliary_loss_mlp": 0.01103711, + "balance_loss_clip": 1.00191104, + "balance_loss_mlp": 1.00061893, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.5135997089963125, + "language_loss": 0.80169451, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82406843, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 2.768613815307617 + }, + { + "auxiliary_loss_clip": 0.01118808, + "auxiliary_loss_mlp": 0.01105068, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.00044942, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.7345130622688514, + "language_loss": 0.7755208, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79775953, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 3.122097969055176 + }, + { + "auxiliary_loss_clip": 0.01133586, + "auxiliary_loss_mlp": 0.01105418, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00051427, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 2.4050833616687872, + "language_loss": 0.67484945, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69723952, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 2.630072832107544 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01105193, + "balance_loss_clip": 1.0016458, + "balance_loss_mlp": 1.00047946, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.4177213897871432, + "language_loss": 0.81609404, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83813077, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 4.149939060211182 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01104893, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00056148, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 2.016442544816203, + "language_loss": 0.70783991, + "learning_rate": 6.570759861612988e-07, + "loss": 0.73005587, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 2.625750780105591 + }, + { + "auxiliary_loss_clip": 0.01148844, + "auxiliary_loss_mlp": 0.01104804, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00066304, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.502207809812574, + "language_loss": 0.73364663, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75618315, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.738386869430542 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01105465, + "balance_loss_clip": 1.00186467, + "balance_loss_mlp": 1.00046563, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7256211957703744, + "language_loss": 0.80655736, + "learning_rate": 6.564988754473642e-07, + "loss": 0.8289535, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.5966899394989014 + }, + { + "auxiliary_loss_clip": 0.0116535, + "auxiliary_loss_mlp": 0.01104647, + "balance_loss_clip": 1.00197327, + "balance_loss_mlp": 1.00060081, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.693781960712913, + "language_loss": 0.72371846, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74641848, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.6775214672088623 + }, + { + "auxiliary_loss_clip": 0.01134148, + "auxiliary_loss_mlp": 0.01105668, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.00057316, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 2.056714436374304, + "language_loss": 0.78366303, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80606115, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.6455421447753906 + }, + { + "auxiliary_loss_clip": 0.01101325, + "auxiliary_loss_mlp": 0.01104827, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.00059032, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 3.7529018654936044, + "language_loss": 0.75212562, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77418715, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.7915024757385254 + }, + { + "auxiliary_loss_clip": 0.01087218, + "auxiliary_loss_mlp": 0.0110489, + "balance_loss_clip": 1.00175989, + "balance_loss_mlp": 1.0005579, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 1.7310785859925237, + "language_loss": 0.81379533, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83571643, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 2.722177505493164 + }, + { + "auxiliary_loss_clip": 0.01149948, + "auxiliary_loss_mlp": 0.01104711, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00056958, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.7111539372536038, + "language_loss": 0.718503, + "learning_rate": 6.550569904036307e-07, + "loss": 0.74104953, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 4.277949810028076 + }, + { + "auxiliary_loss_clip": 0.01150804, + "auxiliary_loss_mlp": 0.01105465, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00065613, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.7398233224233437, + "language_loss": 0.72033465, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74289739, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 2.619194269180298 + }, + { + "auxiliary_loss_clip": 0.01144569, + "auxiliary_loss_mlp": 0.01081643, + "balance_loss_clip": 1.00127864, + "balance_loss_mlp": 1.00000894, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6973664034182016, + "language_loss": 0.59463024, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61689234, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.2477638721466064 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01104105, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00044084, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.5082980262089065, + "language_loss": 0.67431533, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69701111, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 2.481580972671509 + }, + { + "auxiliary_loss_clip": 0.01150732, + "auxiliary_loss_mlp": 0.00747573, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.0005753, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.7439988254605274, + "language_loss": 0.72180241, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74078548, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 4.205312728881836 + }, + { + "auxiliary_loss_clip": 0.01133934, + "auxiliary_loss_mlp": 0.01103212, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00050116, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 1.640870100844553, + "language_loss": 0.65055251, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67292392, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 2.5693130493164062 + }, + { + "auxiliary_loss_clip": 0.01100597, + "auxiliary_loss_mlp": 0.01105017, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00039899, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 2.3194082508087535, + "language_loss": 0.80749333, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82954949, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 2.6924710273742676 + }, + { + "auxiliary_loss_clip": 0.01149175, + "auxiliary_loss_mlp": 0.01104644, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00050306, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.8106665215316755, + "language_loss": 0.68117201, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7037102, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.513596773147583 + }, + { + "auxiliary_loss_clip": 0.01148901, + "auxiliary_loss_mlp": 0.00747537, + "balance_loss_clip": 1.00169587, + "balance_loss_mlp": 1.00054908, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6885514813346056, + "language_loss": 0.72657752, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74554187, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.610344648361206 + }, + { + "auxiliary_loss_clip": 0.01102726, + "auxiliary_loss_mlp": 0.01105038, + "balance_loss_clip": 1.00178111, + "balance_loss_mlp": 1.00042021, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.8612545728664647, + "language_loss": 0.56136352, + "learning_rate": 6.524648112660027e-07, + "loss": 0.5834412, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 2.664372444152832 + }, + { + "auxiliary_loss_clip": 0.01119258, + "auxiliary_loss_mlp": 0.01103957, + "balance_loss_clip": 1.00189173, + "balance_loss_mlp": 1.00057864, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.5371662169966192, + "language_loss": 0.77438116, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79661334, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 2.6601014137268066 + }, + { + "auxiliary_loss_clip": 0.01134262, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00071907, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.5714467988302796, + "language_loss": 0.78027159, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80265802, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.6024365425109863 + }, + { + "auxiliary_loss_clip": 0.01148663, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_clip": 1.00178862, + "balance_loss_mlp": 1.00065744, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.9458937947799761, + "language_loss": 0.78559321, + "learning_rate": 6.516016709364604e-07, + "loss": 0.8081221, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.6564137935638428 + }, + { + "auxiliary_loss_clip": 0.01133864, + "auxiliary_loss_mlp": 0.01104605, + "balance_loss_clip": 1.00162959, + "balance_loss_mlp": 1.00046408, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.516330681245612, + "language_loss": 0.76894915, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79133385, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 2.885313034057617 + }, + { + "auxiliary_loss_clip": 0.01150584, + "auxiliary_loss_mlp": 0.01104053, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00048399, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.4587932059109008, + "language_loss": 0.71341991, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73596632, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 2.5901129245758057 + }, + { + "auxiliary_loss_clip": 0.01119294, + "auxiliary_loss_mlp": 0.01105076, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00055289, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.7369507925951952, + "language_loss": 0.74487525, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76711893, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 2.669264793395996 + }, + { + "auxiliary_loss_clip": 0.01148653, + "auxiliary_loss_mlp": 0.01104317, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.0005573, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.5712515582627613, + "language_loss": 0.69002789, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71255755, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 2.715731143951416 + }, + { + "auxiliary_loss_clip": 0.01132231, + "auxiliary_loss_mlp": 0.00747395, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00046217, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 2.1150041855654207, + "language_loss": 0.75708246, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77587873, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.5774786472320557 + }, + { + "auxiliary_loss_clip": 0.01165579, + "auxiliary_loss_mlp": 0.01104554, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.0006032, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.6420281116348932, + "language_loss": 0.78260374, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80530512, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.548811912536621 + }, + { + "auxiliary_loss_clip": 0.0113393, + "auxiliary_loss_mlp": 0.01104747, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00041533, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.5604035866917543, + "language_loss": 0.69634873, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71873546, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.627551317214966 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01104587, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00054145, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.4219232437298475, + "language_loss": 0.75023508, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77262646, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.5691184997558594 + }, + { + "auxiliary_loss_clip": 0.01103214, + "auxiliary_loss_mlp": 0.01105551, + "balance_loss_clip": 1.0015831, + "balance_loss_mlp": 1.00055194, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 2.8155457787862823, + "language_loss": 0.76948619, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79157388, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 2.6671504974365234 + }, + { + "auxiliary_loss_clip": 0.01083216, + "auxiliary_loss_mlp": 0.0110467, + "balance_loss_clip": 1.00172138, + "balance_loss_mlp": 1.00052905, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.1105563309685302, + "language_loss": 0.76322716, + "learning_rate": 6.487278616990774e-07, + "loss": 0.785106, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.660878896713257 + }, + { + "auxiliary_loss_clip": 0.01148612, + "auxiliary_loss_mlp": 0.0110376, + "balance_loss_clip": 1.00182903, + "balance_loss_mlp": 1.00047755, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.8623463558438298, + "language_loss": 0.77265811, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79518187, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 2.521446704864502 + }, + { + "auxiliary_loss_clip": 0.01119067, + "auxiliary_loss_mlp": 0.01104111, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.00035131, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 5.289654084143968, + "language_loss": 0.79391146, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81614327, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 2.665125608444214 + }, + { + "auxiliary_loss_clip": 0.01149168, + "auxiliary_loss_mlp": 0.01104538, + "balance_loss_clip": 1.00181735, + "balance_loss_mlp": 1.00039697, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 1.883175231976145, + "language_loss": 0.67291486, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69545197, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 2.9146220684051514 + }, + { + "auxiliary_loss_clip": 0.0111713, + "auxiliary_loss_mlp": 0.01105662, + "balance_loss_clip": 1.00158143, + "balance_loss_mlp": 1.00056732, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 2.7942545454197623, + "language_loss": 0.71957958, + "learning_rate": 6.475797721245648e-07, + "loss": 0.74180746, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.7000484466552734 + }, + { + "auxiliary_loss_clip": 0.01116996, + "auxiliary_loss_mlp": 0.00747547, + "balance_loss_clip": 1.00181329, + "balance_loss_mlp": 1.00062215, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.8177999867976475, + "language_loss": 0.65305632, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67170179, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.632312774658203 + }, + { + "auxiliary_loss_clip": 0.01148876, + "auxiliary_loss_mlp": 0.01104844, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00060761, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 1.793748862507845, + "language_loss": 0.78636909, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80890632, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 2.5522663593292236 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01106088, + "balance_loss_clip": 1.00187218, + "balance_loss_mlp": 1.0006119, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 1.8584197331891428, + "language_loss": 0.72477478, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74700725, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 2.61187744140625 + }, + { + "auxiliary_loss_clip": 0.01112499, + "auxiliary_loss_mlp": 0.01081264, + "balance_loss_clip": 1.00105214, + "balance_loss_mlp": 1.00001097, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.645593309842037, + "language_loss": 0.5462538, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56819147, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 4.812243461608887 + }, + { + "auxiliary_loss_clip": 0.01134249, + "auxiliary_loss_mlp": 0.01104833, + "balance_loss_clip": 1.00165653, + "balance_loss_mlp": 1.00059676, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 2.1707456924438704, + "language_loss": 0.75987583, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78226662, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 2.582876443862915 + }, + { + "auxiliary_loss_clip": 0.01149043, + "auxiliary_loss_mlp": 0.01103963, + "balance_loss_clip": 1.00175834, + "balance_loss_mlp": 1.00039375, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 2.0930374911731198, + "language_loss": 0.79473519, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81726527, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.5823123455047607 + }, + { + "auxiliary_loss_clip": 0.01115508, + "auxiliary_loss_mlp": 0.01106305, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00063837, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 2.0397087413272446, + "language_loss": 0.81982106, + "learning_rate": 6.455725902183813e-07, + "loss": 0.84203923, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.6510300636291504 + }, + { + "auxiliary_loss_clip": 0.01150731, + "auxiliary_loss_mlp": 0.01104261, + "balance_loss_clip": 1.00200832, + "balance_loss_mlp": 1.0005008, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.5577230533315376, + "language_loss": 0.71078479, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73333472, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.5650177001953125 + }, + { + "auxiliary_loss_clip": 0.011154, + "auxiliary_loss_mlp": 0.01104358, + "balance_loss_clip": 1.00161123, + "balance_loss_mlp": 1.00050259, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 3.517967823476518, + "language_loss": 0.70095015, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72314763, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.608241558074951 + }, + { + "auxiliary_loss_clip": 0.01150717, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00042439, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5335834528181593, + "language_loss": 0.84754658, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87010038, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.602855682373047 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_clip": 1.00163305, + "balance_loss_mlp": 1.00055206, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 3.5129514291733708, + "language_loss": 0.79075444, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81281841, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.727961778640747 + }, + { + "auxiliary_loss_clip": 0.01134168, + "auxiliary_loss_mlp": 0.01104653, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00041676, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.8055090491403625, + "language_loss": 0.8508473, + "learning_rate": 6.441404294400014e-07, + "loss": 0.87323552, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 5.544491291046143 + }, + { + "auxiliary_loss_clip": 0.01165517, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.00197446, + "balance_loss_mlp": 1.00049162, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 2.092033697467044, + "language_loss": 0.73837078, + "learning_rate": 6.438541514838811e-07, + "loss": 0.76106846, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.496922016143799 + }, + { + "auxiliary_loss_clip": 0.01148712, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_clip": 1.00190628, + "balance_loss_mlp": 1.00060463, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.5389668126258054, + "language_loss": 0.7637713, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78629726, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 2.583357572555542 + }, + { + "auxiliary_loss_clip": 0.01148687, + "auxiliary_loss_mlp": 0.01105302, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.00058889, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 2.1342232734891864, + "language_loss": 0.72316778, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74570763, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 2.5393683910369873 + }, + { + "auxiliary_loss_clip": 0.01083722, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.0016892, + "balance_loss_mlp": 1.00046849, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.5636008084910948, + "language_loss": 0.81407541, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83238614, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 4.125861406326294 + }, + { + "auxiliary_loss_clip": 0.01148712, + "auxiliary_loss_mlp": 0.01104744, + "balance_loss_clip": 1.00178063, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 2.110255659800076, + "language_loss": 0.71402931, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73656392, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 2.5723793506622314 + }, + { + "auxiliary_loss_clip": 0.01100529, + "auxiliary_loss_mlp": 0.01105625, + "balance_loss_clip": 1.00173032, + "balance_loss_mlp": 1.00043464, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 2.312193386807051, + "language_loss": 0.67559904, + "learning_rate": 6.424235332981245e-07, + "loss": 0.69766057, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.707448959350586 + }, + { + "auxiliary_loss_clip": 0.01165364, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00062919, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 1.6658229364606478, + "language_loss": 0.76667738, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7893759, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.480861186981201 + }, + { + "auxiliary_loss_clip": 0.01148701, + "auxiliary_loss_mlp": 0.01104099, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00043511, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.5942222404667528, + "language_loss": 0.77679121, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79931921, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 2.5335633754730225 + }, + { + "auxiliary_loss_clip": 0.01135971, + "auxiliary_loss_mlp": 0.01104635, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00058913, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 1.8650698713598326, + "language_loss": 0.73925459, + "learning_rate": 6.415657800531038e-07, + "loss": 0.7616607, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.542222499847412 + }, + { + "auxiliary_loss_clip": 0.01148774, + "auxiliary_loss_mlp": 0.01105271, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00055766, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.6900622416679256, + "language_loss": 0.81774783, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84028828, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 2.620600461959839 + }, + { + "auxiliary_loss_clip": 0.01115298, + "auxiliary_loss_mlp": 0.01104214, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.00054979, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.7928496289556903, + "language_loss": 0.6462009, + "learning_rate": 6.409942020981611e-07, + "loss": 0.668396, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 2.627476215362549 + }, + { + "auxiliary_loss_clip": 0.01118553, + "auxiliary_loss_mlp": 0.01103826, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00054336, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.5622786517047549, + "language_loss": 0.73127753, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75350142, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.766573190689087 + }, + { + "auxiliary_loss_clip": 0.01110657, + "auxiliary_loss_mlp": 0.01081676, + "balance_loss_clip": 1.00111639, + "balance_loss_mlp": 1.00004125, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.9576379904038758, + "language_loss": 0.58821678, + "learning_rate": 6.404228302777621e-07, + "loss": 0.61014009, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 3.0333781242370605 + }, + { + "auxiliary_loss_clip": 0.01165431, + "auxiliary_loss_mlp": 0.01103925, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00054622, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.5965588869798806, + "language_loss": 0.77749562, + "learning_rate": 6.401372216950995e-07, + "loss": 0.80018914, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 2.5212509632110596 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01104631, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00058472, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.599789588445477, + "language_loss": 0.69251621, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71492326, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.5755083560943604 + }, + { + "auxiliary_loss_clip": 0.0106805, + "auxiliary_loss_mlp": 0.01105403, + "balance_loss_clip": 1.00166297, + "balance_loss_mlp": 1.00059426, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.6840895430350389, + "language_loss": 0.65021074, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67194533, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 2.715019464492798 + }, + { + "auxiliary_loss_clip": 0.01117912, + "auxiliary_loss_mlp": 0.01105599, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00059915, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.9116368555466388, + "language_loss": 0.72223806, + "learning_rate": 6.392807053872212e-07, + "loss": 0.7444731, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.6450843811035156 + }, + { + "auxiliary_loss_clip": 0.01150545, + "auxiliary_loss_mlp": 0.01105244, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00053096, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.6434798761264513, + "language_loss": 0.72640949, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74896741, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 2.556166172027588 + }, + { + "auxiliary_loss_clip": 0.01150667, + "auxiliary_loss_mlp": 0.01103556, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00055885, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.66696090013314, + "language_loss": 0.66167855, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68422073, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.551373243331909 + }, + { + "auxiliary_loss_clip": 0.01148941, + "auxiliary_loss_mlp": 0.00747459, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.00051951, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.096316121368659, + "language_loss": 0.84281623, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86178023, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.545264720916748 + }, + { + "auxiliary_loss_clip": 0.01117508, + "auxiliary_loss_mlp": 0.01103859, + "balance_loss_clip": 1.00164962, + "balance_loss_mlp": 1.0003854, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.5650309240118547, + "language_loss": 0.77855885, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80077243, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.6648263931274414 + }, + { + "auxiliary_loss_clip": 0.01119572, + "auxiliary_loss_mlp": 0.01104361, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00060105, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.7399527436694657, + "language_loss": 0.62426591, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64650524, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.7169525623321533 + }, + { + "auxiliary_loss_clip": 0.01146584, + "auxiliary_loss_mlp": 0.00746456, + "balance_loss_clip": 1.00133252, + "balance_loss_mlp": 1.00097609, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7164759496726628, + "language_loss": 0.54858339, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56751382, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.1544957160949707 + }, + { + "auxiliary_loss_clip": 0.01133985, + "auxiliary_loss_mlp": 0.01105054, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00043559, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.5252316179050267, + "language_loss": 0.55181497, + "learning_rate": 6.372839737918154e-07, + "loss": 0.57420534, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 2.7062411308288574 + }, + { + "auxiliary_loss_clip": 0.0108349, + "auxiliary_loss_mlp": 0.01104712, + "balance_loss_clip": 1.00177765, + "balance_loss_mlp": 1.00047565, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.707236227524485, + "language_loss": 0.75040591, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77228796, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.744877815246582 + }, + { + "auxiliary_loss_clip": 0.01102254, + "auxiliary_loss_mlp": 0.01104588, + "balance_loss_clip": 1.00166535, + "balance_loss_mlp": 1.00063717, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.5186912847984173, + "language_loss": 0.69496512, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71703357, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 2.8524599075317383 + }, + { + "auxiliary_loss_clip": 0.01117015, + "auxiliary_loss_mlp": 0.01104924, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00049675, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.6596151927880698, + "language_loss": 0.73763716, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75985658, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 2.623417377471924 + }, + { + "auxiliary_loss_clip": 0.01148658, + "auxiliary_loss_mlp": 0.01104679, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.0005374, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.6477784810586864, + "language_loss": 0.69219434, + "learning_rate": 6.361441209060039e-07, + "loss": 0.7147277, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.5511109828948975 + }, + { + "auxiliary_loss_clip": 0.01165353, + "auxiliary_loss_mlp": 0.01104124, + "balance_loss_clip": 1.00198042, + "balance_loss_mlp": 1.00055492, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.9227767575063293, + "language_loss": 0.74638188, + "learning_rate": 6.358592869514216e-07, + "loss": 0.7690767, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 3.9312520027160645 + }, + { + "auxiliary_loss_clip": 0.01150852, + "auxiliary_loss_mlp": 0.01105609, + "balance_loss_clip": 1.00200772, + "balance_loss_mlp": 1.0004189, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.890473594580194, + "language_loss": 0.67247313, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69503778, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 2.5370376110076904 + }, + { + "auxiliary_loss_clip": 0.01132444, + "auxiliary_loss_mlp": 0.01106255, + "balance_loss_clip": 1.00165522, + "balance_loss_mlp": 1.00058782, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.5505775771247883, + "language_loss": 0.72641742, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74880445, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 2.615009307861328 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01105502, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00059748, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 1.8825437807975636, + "language_loss": 0.74901575, + "learning_rate": 6.350050955009796e-07, + "loss": 0.77124107, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 2.6841700077056885 + }, + { + "auxiliary_loss_clip": 0.0114878, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00036478, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.2849987333412094, + "language_loss": 0.67598796, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69851315, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.545767307281494 + }, + { + "auxiliary_loss_clip": 0.01150911, + "auxiliary_loss_mlp": 0.01104905, + "balance_loss_clip": 1.00206828, + "balance_loss_mlp": 1.00047791, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.8948709168813993, + "language_loss": 0.74409783, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76665604, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.6658759117126465 + }, + { + "auxiliary_loss_clip": 0.01116989, + "auxiliary_loss_mlp": 0.01104666, + "balance_loss_clip": 1.00168157, + "balance_loss_mlp": 1.00042987, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 1.7836765375892263, + "language_loss": 0.69327211, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71548867, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.6192283630371094 + }, + { + "auxiliary_loss_clip": 0.01114543, + "auxiliary_loss_mlp": 0.01104637, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00059092, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.3614595697199108, + "language_loss": 0.65341139, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67560321, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.628622055053711 + }, + { + "auxiliary_loss_clip": 0.01165478, + "auxiliary_loss_mlp": 0.01104595, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00054932, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.6063402012253853, + "language_loss": 0.74854994, + "learning_rate": 6.335824784423118e-07, + "loss": 0.77125067, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 3.9343128204345703 + }, + { + "auxiliary_loss_clip": 0.01150423, + "auxiliary_loss_mlp": 0.01106271, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00050879, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 1.8312520718038026, + "language_loss": 0.58243167, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60499859, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 4.04417085647583 + }, + { + "auxiliary_loss_clip": 0.01149118, + "auxiliary_loss_mlp": 0.01105939, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00055826, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.8611163911277044, + "language_loss": 0.60280299, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62535357, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.595033884048462 + }, + { + "auxiliary_loss_clip": 0.01135469, + "auxiliary_loss_mlp": 0.01104471, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00061607, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.3522272078192716, + "language_loss": 0.75626993, + "learning_rate": 6.327295298970734e-07, + "loss": 0.7786693, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 2.632550001144409 + }, + { + "auxiliary_loss_clip": 0.01148777, + "auxiliary_loss_mlp": 0.01105542, + "balance_loss_clip": 1.00185525, + "balance_loss_mlp": 1.00044727, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.7409950533807468, + "language_loss": 0.75245237, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77499551, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 2.4939441680908203 + }, + { + "auxiliary_loss_clip": 0.0113243, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.00173736, + "balance_loss_mlp": 1.00043237, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.6361794900500373, + "language_loss": 0.70079756, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72318572, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 3.9331977367401123 + }, + { + "auxiliary_loss_clip": 0.01100615, + "auxiliary_loss_mlp": 0.01104993, + "balance_loss_clip": 1.00178778, + "balance_loss_mlp": 1.00056577, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.638087953969301, + "language_loss": 0.67532116, + "learning_rate": 6.318770479751232e-07, + "loss": 0.6973772, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 2.6687703132629395 + }, + { + "auxiliary_loss_clip": 0.01165229, + "auxiliary_loss_mlp": 0.01103238, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00052774, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.782505724994788, + "language_loss": 0.79554248, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81822717, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.5390024185180664 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00051546, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.9426514274603652, + "language_loss": 0.67831838, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70054841, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 2.723511219024658 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.01105312, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.00050318, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.8503474549007113, + "language_loss": 0.70518124, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72744656, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 2.725504159927368 + }, + { + "auxiliary_loss_clip": 0.01118146, + "auxiliary_loss_mlp": 0.01103687, + "balance_loss_clip": 1.00170231, + "balance_loss_mlp": 1.00040412, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.701185625997315, + "language_loss": 0.67242593, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69464421, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.6292364597320557 + }, + { + "auxiliary_loss_clip": 0.01134029, + "auxiliary_loss_mlp": 0.01104131, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00056171, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6452738292535345, + "language_loss": 0.80944335, + "learning_rate": 6.304572825026344e-07, + "loss": 0.8318249, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 2.5479495525360107 + }, + { + "auxiliary_loss_clip": 0.01117103, + "auxiliary_loss_mlp": 0.01104181, + "balance_loss_clip": 1.00167894, + "balance_loss_mlp": 1.0006125, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 1.9051650956972006, + "language_loss": 0.71013594, + "learning_rate": 6.301734851646674e-07, + "loss": 0.7323488, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 2.6626758575439453 + }, + { + "auxiliary_loss_clip": 0.01133234, + "auxiliary_loss_mlp": 0.0110384, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00046158, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.536715818486792, + "language_loss": 0.74227464, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76464534, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 2.6541593074798584 + }, + { + "auxiliary_loss_clip": 0.01149132, + "auxiliary_loss_mlp": 0.00747517, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.0004797, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.094921792661135, + "language_loss": 0.823632, + "learning_rate": 6.296060463313698e-07, + "loss": 0.8425985, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.5363969802856445 + }, + { + "auxiliary_loss_clip": 0.01085544, + "auxiliary_loss_mlp": 0.01106054, + "balance_loss_clip": 1.00181162, + "balance_loss_mlp": 1.00057745, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.7882331168852295, + "language_loss": 0.62729502, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64921099, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 2.7593891620635986 + }, + { + "auxiliary_loss_clip": 0.01117343, + "auxiliary_loss_mlp": 0.01104002, + "balance_loss_clip": 1.00156808, + "balance_loss_mlp": 1.0003376, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.174860246608478, + "language_loss": 0.71241188, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73462528, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.594937801361084 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.0110522, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.0005064, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.3796547269064297, + "language_loss": 0.68579113, + "learning_rate": 6.287552778493786e-07, + "loss": 0.70787281, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 2.6688530445098877 + }, + { + "auxiliary_loss_clip": 0.01148722, + "auxiliary_loss_mlp": 0.011042, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00044012, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.6242385130761048, + "language_loss": 0.74358594, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76611519, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.5354199409484863 + }, + { + "auxiliary_loss_clip": 0.01133849, + "auxiliary_loss_mlp": 0.00747527, + "balance_loss_clip": 1.0019176, + "balance_loss_mlp": 1.00049829, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.984600522479892, + "language_loss": 0.73137927, + "learning_rate": 6.281883588321475e-07, + "loss": 0.750193, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 2.5632004737854004 + }, + { + "auxiliary_loss_clip": 0.01101434, + "auxiliary_loss_mlp": 0.01104444, + "balance_loss_clip": 1.00163007, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.3557081044991284, + "language_loss": 0.72252053, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74457932, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 2.712738513946533 + }, + { + "auxiliary_loss_clip": 0.01165476, + "auxiliary_loss_mlp": 0.01104902, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00066543, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.7684183124439603, + "language_loss": 0.73616803, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75887179, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.557910680770874 + }, + { + "auxiliary_loss_clip": 0.01115273, + "auxiliary_loss_mlp": 0.01105638, + "balance_loss_clip": 1.0019387, + "balance_loss_mlp": 1.000543, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 2.446076409271684, + "language_loss": 0.61175907, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63396817, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.6434357166290283 + }, + { + "auxiliary_loss_clip": 0.01165236, + "auxiliary_loss_mlp": 0.01104247, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00048685, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 1.9645878958214495, + "language_loss": 0.70414954, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72684437, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 2.5381126403808594 + }, + { + "auxiliary_loss_clip": 0.01150447, + "auxiliary_loss_mlp": 0.01105676, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00048518, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.29617757664221, + "language_loss": 0.80558735, + "learning_rate": 6.267719718136988e-07, + "loss": 0.8281486, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.566232442855835 + }, + { + "auxiliary_loss_clip": 0.01165869, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00055528, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 2.0656362627243747, + "language_loss": 0.71386772, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73658764, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.492507219314575 + }, + { + "auxiliary_loss_clip": 0.01132241, + "auxiliary_loss_mlp": 0.01105069, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00054586, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.5313536710510522, + "language_loss": 0.73915493, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76152802, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.5870704650878906 + }, + { + "auxiliary_loss_clip": 0.01125744, + "auxiliary_loss_mlp": 0.01081633, + "balance_loss_clip": 1.00131059, + "balance_loss_mlp": 0.99999839, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7330364267295026, + "language_loss": 0.59425575, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61632949, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 3.315354347229004 + }, + { + "auxiliary_loss_clip": 0.01119211, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00040579, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 1.6589582839511146, + "language_loss": 0.79606003, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81829286, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 2.6053967475891113 + }, + { + "auxiliary_loss_clip": 0.01146265, + "auxiliary_loss_mlp": 0.01081697, + "balance_loss_clip": 1.00132942, + "balance_loss_mlp": 1.00006223, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8367982416615676, + "language_loss": 0.61391187, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63619149, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 4.403641700744629 + }, + { + "auxiliary_loss_clip": 0.01135239, + "auxiliary_loss_mlp": 0.0110497, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00054288, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 3.4179621658759785, + "language_loss": 0.67625749, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69865954, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.524110794067383 + }, + { + "auxiliary_loss_clip": 0.01100847, + "auxiliary_loss_mlp": 0.011045, + "balance_loss_clip": 1.00158525, + "balance_loss_mlp": 1.00054932, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 2.35881019982874, + "language_loss": 0.79933071, + "learning_rate": 6.247912173519106e-07, + "loss": 0.82138419, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 2.6717379093170166 + }, + { + "auxiliary_loss_clip": 0.01119522, + "auxiliary_loss_mlp": 0.0110441, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00064993, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.7968682858008698, + "language_loss": 0.80498248, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82722181, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 2.664719820022583 + }, + { + "auxiliary_loss_clip": 0.01134316, + "auxiliary_loss_mlp": 0.01104941, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00051403, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.84149411860959, + "language_loss": 0.85943592, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88182843, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 2.61334490776062 + }, + { + "auxiliary_loss_clip": 0.01148975, + "auxiliary_loss_mlp": 0.01104395, + "balance_loss_clip": 1.00200808, + "balance_loss_mlp": 1.00044489, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 1.8323114913089766, + "language_loss": 0.69786477, + "learning_rate": 6.239431045888435e-07, + "loss": 0.72039843, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.586684226989746 + }, + { + "auxiliary_loss_clip": 0.01165452, + "auxiliary_loss_mlp": 0.0110486, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00043225, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.1306968320564814, + "language_loss": 0.70376205, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72646517, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 2.553853750228882 + }, + { + "auxiliary_loss_clip": 0.01117291, + "auxiliary_loss_mlp": 0.01104417, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00056219, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.6754429588302369, + "language_loss": 0.77661306, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79883009, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.689194917678833 + }, + { + "auxiliary_loss_clip": 0.01131895, + "auxiliary_loss_mlp": 0.01103377, + "balance_loss_clip": 1.00164902, + "balance_loss_mlp": 1.00047529, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6365665670807843, + "language_loss": 0.78274459, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80509728, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.5826265811920166 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_clip": 1.00178993, + "balance_loss_mlp": 1.00052035, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.142421588101438, + "language_loss": 0.74127895, + "learning_rate": 6.22813018144422e-07, + "loss": 0.76351374, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 5.41890025138855 + }, + { + "auxiliary_loss_clip": 0.01151017, + "auxiliary_loss_mlp": 0.01104898, + "balance_loss_clip": 1.00200891, + "balance_loss_mlp": 1.00056624, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.3255028435249336, + "language_loss": 0.66514438, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68770349, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.56011962890625 + }, + { + "auxiliary_loss_clip": 0.01101479, + "auxiliary_loss_mlp": 0.00747423, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00046599, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 3.6905576746903495, + "language_loss": 0.76394618, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78243518, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 2.6529178619384766 + }, + { + "auxiliary_loss_clip": 0.01116509, + "auxiliary_loss_mlp": 0.01104563, + "balance_loss_clip": 1.00163186, + "balance_loss_mlp": 1.00042212, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 1.9502376679015738, + "language_loss": 0.69397515, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71618593, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 2.6283957958221436 + }, + { + "auxiliary_loss_clip": 0.01133715, + "auxiliary_loss_mlp": 0.01105081, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00046253, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 2.05489838447687, + "language_loss": 0.69454491, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71693289, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 4.035565614700317 + }, + { + "auxiliary_loss_clip": 0.01130864, + "auxiliary_loss_mlp": 0.01105765, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00057483, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 1.8421354911153647, + "language_loss": 0.75363028, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77599663, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 2.5739574432373047 + }, + { + "auxiliary_loss_clip": 0.01133712, + "auxiliary_loss_mlp": 0.01105146, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00043225, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.2139658498364305, + "language_loss": 0.76889348, + "learning_rate": 6.211194553838929e-07, + "loss": 0.79128206, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 2.5482587814331055 + }, + { + "auxiliary_loss_clip": 0.01148722, + "auxiliary_loss_mlp": 0.00747503, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00052559, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.5120869828344812, + "language_loss": 0.8454091, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86437142, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.549804449081421 + }, + { + "auxiliary_loss_clip": 0.01119459, + "auxiliary_loss_mlp": 0.01105772, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00048602, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 2.992223134104986, + "language_loss": 0.7370851, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75933748, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 2.615126132965088 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01105949, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00056839, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 2.304380161008438, + "language_loss": 0.74595308, + "learning_rate": 6.202733797375492e-07, + "loss": 0.7683531, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 2.576324224472046 + }, + { + "auxiliary_loss_clip": 0.01150949, + "auxiliary_loss_mlp": 0.01106723, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.00077033, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 2.196721470980198, + "language_loss": 0.80481368, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82739043, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.522784948348999 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00050974, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.9682478261176741, + "language_loss": 0.77967656, + "learning_rate": 6.19709590885688e-07, + "loss": 0.80190086, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.6339542865753174 + }, + { + "auxiliary_loss_clip": 0.01129299, + "auxiliary_loss_mlp": 0.01081302, + "balance_loss_clip": 1.00112796, + "balance_loss_mlp": 1.00004911, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8163662612738655, + "language_loss": 0.54462099, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56672704, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 3.1336252689361572 + }, + { + "auxiliary_loss_clip": 0.01134106, + "auxiliary_loss_mlp": 0.01104579, + "balance_loss_clip": 1.0017041, + "balance_loss_mlp": 1.0005331, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.6720866624671378, + "language_loss": 0.80278695, + "learning_rate": 6.191460113968272e-07, + "loss": 0.8251738, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 2.5824527740478516 + }, + { + "auxiliary_loss_clip": 0.01148997, + "auxiliary_loss_mlp": 0.01105589, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00058937, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 4.618563245890064, + "language_loss": 0.62976229, + "learning_rate": 6.188643001902369e-07, + "loss": 0.65230823, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 2.5601871013641357 + }, + { + "auxiliary_loss_clip": 0.01134021, + "auxiliary_loss_mlp": 0.01104165, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00078654, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.7597039921007338, + "language_loss": 0.77861434, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80099624, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.588759183883667 + }, + { + "auxiliary_loss_clip": 0.01119951, + "auxiliary_loss_mlp": 0.01105112, + "balance_loss_clip": 1.00191557, + "balance_loss_mlp": 1.00049376, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.6035185567446653, + "language_loss": 0.71387112, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73612177, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.6670262813568115 + }, + { + "auxiliary_loss_clip": 0.01165498, + "auxiliary_loss_mlp": 0.0110499, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00065851, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.645781144496748, + "language_loss": 0.70033091, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72303581, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.570377826690674 + }, + { + "auxiliary_loss_clip": 0.01165412, + "auxiliary_loss_mlp": 0.01104601, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00045991, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 1.9970412663869013, + "language_loss": 0.7403053, + "learning_rate": 6.177379791987131e-07, + "loss": 0.7630055, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 2.517472982406616 + }, + { + "auxiliary_loss_clip": 0.01132951, + "auxiliary_loss_mlp": 0.01104081, + "balance_loss_clip": 1.00176191, + "balance_loss_mlp": 1.00041652, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 1.8541313536530724, + "language_loss": 0.84535116, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86772156, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.5628585815429688 + }, + { + "auxiliary_loss_clip": 0.0111721, + "auxiliary_loss_mlp": 0.0110462, + "balance_loss_clip": 1.00163293, + "balance_loss_mlp": 1.0004791, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.3916043512033371, + "language_loss": 0.78144526, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80366361, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 2.625070333480835 + }, + { + "auxiliary_loss_clip": 0.01150788, + "auxiliary_loss_mlp": 0.01105353, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.0004493, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.153036695849775, + "language_loss": 0.72847658, + "learning_rate": 6.168937887805932e-07, + "loss": 0.75103801, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.5725743770599365 + }, + { + "auxiliary_loss_clip": 0.01133961, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00050783, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.9187937099160066, + "language_loss": 0.67434502, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69673491, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 2.728144645690918 + }, + { + "auxiliary_loss_clip": 0.01084528, + "auxiliary_loss_mlp": 0.01104456, + "balance_loss_clip": 1.00159144, + "balance_loss_mlp": 1.00060129, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.5556084747047494, + "language_loss": 0.77258241, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79447228, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 2.66556978225708 + }, + { + "auxiliary_loss_clip": 0.01148645, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00048661, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 2.5176305829513557, + "language_loss": 0.75302476, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77556032, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.5929689407348633 + }, + { + "auxiliary_loss_clip": 0.01165448, + "auxiliary_loss_mlp": 0.01105052, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00052929, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 2.391402341156005, + "language_loss": 0.78147793, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80418289, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.503227949142456 + }, + { + "auxiliary_loss_clip": 0.01150717, + "auxiliary_loss_mlp": 0.01105167, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.00054884, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.740377948712503, + "language_loss": 0.76675636, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78931516, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.5619583129882812 + }, + { + "auxiliary_loss_clip": 0.01118323, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_clip": 1.00168979, + "balance_loss_mlp": 1.00045693, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 1.7900956718265049, + "language_loss": 0.70814675, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73037118, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.650921106338501 + }, + { + "auxiliary_loss_clip": 0.01148312, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00040555, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.857545154098358, + "language_loss": 0.80787003, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82682729, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 2.554436206817627 + }, + { + "auxiliary_loss_clip": 0.01165492, + "auxiliary_loss_mlp": 0.01105287, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00047851, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 1.6854890102715254, + "language_loss": 0.78314352, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80585134, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 3.9414594173431396 + }, + { + "auxiliary_loss_clip": 0.0116539, + "auxiliary_loss_mlp": 0.00747384, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00039601, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 1.982018231893627, + "language_loss": 0.70760727, + "learning_rate": 6.143640508441898e-07, + "loss": 0.726735, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 2.587646961212158 + }, + { + "auxiliary_loss_clip": 0.01102062, + "auxiliary_loss_mlp": 0.01103427, + "balance_loss_clip": 1.0017246, + "balance_loss_mlp": 1.00052547, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.6404953322684073, + "language_loss": 0.78083062, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80288553, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 2.7496612071990967 + }, + { + "auxiliary_loss_clip": 0.01148738, + "auxiliary_loss_mlp": 0.01104828, + "balance_loss_clip": 1.00181544, + "balance_loss_mlp": 1.00059128, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.4587781661298156, + "language_loss": 0.76724494, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78978062, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 2.6489319801330566 + }, + { + "auxiliary_loss_clip": 0.01134139, + "auxiliary_loss_mlp": 0.01103988, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00060987, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.7501247820701817, + "language_loss": 0.74187839, + "learning_rate": 6.135217502639878e-07, + "loss": 0.7642597, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 2.6641345024108887 + }, + { + "auxiliary_loss_clip": 0.01150106, + "auxiliary_loss_mlp": 0.01103953, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00047886, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.475862803640723, + "language_loss": 0.7941162, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81665683, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.660876750946045 + }, + { + "auxiliary_loss_clip": 0.01150237, + "auxiliary_loss_mlp": 0.01107085, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00046468, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 1.9861191842046437, + "language_loss": 0.73595691, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75853014, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 2.6423375606536865 + }, + { + "auxiliary_loss_clip": 0.01133453, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_clip": 1.00171208, + "balance_loss_mlp": 1.00039351, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.753959458160237, + "language_loss": 0.78565365, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80802494, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 2.655014753341675 + }, + { + "auxiliary_loss_clip": 0.01132001, + "auxiliary_loss_mlp": 0.01104372, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.00061226, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.1501172297247764, + "language_loss": 0.70437676, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72674048, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.6258654594421387 + }, + { + "auxiliary_loss_clip": 0.01161159, + "auxiliary_loss_mlp": 0.01081295, + "balance_loss_clip": 1.00139546, + "balance_loss_mlp": 1.00004184, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9839016474648886, + "language_loss": 0.63976693, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66219139, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 4.522782802581787 + }, + { + "auxiliary_loss_clip": 0.01118869, + "auxiliary_loss_mlp": 0.01103911, + "balance_loss_clip": 1.00181413, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.3771655587410585, + "language_loss": 0.68498629, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70721412, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 4.225619792938232 + }, + { + "auxiliary_loss_clip": 0.01144042, + "auxiliary_loss_mlp": 0.00746326, + "balance_loss_clip": 1.0014019, + "balance_loss_mlp": 1.0007385, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6357581400761945, + "language_loss": 0.5507074, + "learning_rate": 6.11558222878809e-07, + "loss": 0.56961101, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.2536585330963135 + }, + { + "auxiliary_loss_clip": 0.0115086, + "auxiliary_loss_mlp": 0.01105302, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00058866, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 1.6809198550643862, + "language_loss": 0.78051031, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80307186, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 2.628629207611084 + }, + { + "auxiliary_loss_clip": 0.0113372, + "auxiliary_loss_mlp": 0.01104722, + "balance_loss_clip": 1.00188673, + "balance_loss_mlp": 1.00058067, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.528671092948107, + "language_loss": 0.71272337, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73510778, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 4.00508451461792 + }, + { + "auxiliary_loss_clip": 0.01150775, + "auxiliary_loss_mlp": 0.01104333, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00057268, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.6764835518579548, + "language_loss": 0.71996522, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74251628, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.5097548961639404 + }, + { + "auxiliary_loss_clip": 0.0116555, + "auxiliary_loss_mlp": 0.01106233, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00066113, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.6399909892912674, + "language_loss": 0.61987245, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64259028, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.5377533435821533 + }, + { + "auxiliary_loss_clip": 0.01148864, + "auxiliary_loss_mlp": 0.01104551, + "balance_loss_clip": 1.00202906, + "balance_loss_mlp": 1.00060081, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.8932592031755822, + "language_loss": 0.8144092, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83694339, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 2.546607255935669 + }, + { + "auxiliary_loss_clip": 0.01133948, + "auxiliary_loss_mlp": 0.01105823, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.000633, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.6496653040408495, + "language_loss": 0.75912368, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78152132, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 2.5411789417266846 + }, + { + "auxiliary_loss_clip": 0.01148867, + "auxiliary_loss_mlp": 0.01103499, + "balance_loss_clip": 1.00181448, + "balance_loss_mlp": 1.00050259, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.6050233553457747, + "language_loss": 0.82366633, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84618998, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.5608878135681152 + }, + { + "auxiliary_loss_clip": 0.01149034, + "auxiliary_loss_mlp": 0.01105403, + "balance_loss_clip": 1.0017283, + "balance_loss_mlp": 1.0005939, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 2.210820776818386, + "language_loss": 0.74854219, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77108657, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.5388071537017822 + }, + { + "auxiliary_loss_clip": 0.01148853, + "auxiliary_loss_mlp": 0.01104437, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00048649, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 2.790867528681644, + "language_loss": 0.6908704, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71340322, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 2.5135884284973145 + }, + { + "auxiliary_loss_clip": 0.01148601, + "auxiliary_loss_mlp": 0.01103973, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.00059485, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 1.7187642244008987, + "language_loss": 0.69931316, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72183895, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 2.6043686866760254 + }, + { + "auxiliary_loss_clip": 0.01100237, + "auxiliary_loss_mlp": 0.01103878, + "balance_loss_clip": 1.0017122, + "balance_loss_mlp": 1.00049949, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.418377857917288, + "language_loss": 0.89355791, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91559899, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 2.7027814388275146 + }, + { + "auxiliary_loss_clip": 0.01132049, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00058794, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.4987819923212793, + "language_loss": 0.74148035, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76385188, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 2.5935025215148926 + }, + { + "auxiliary_loss_clip": 0.01100782, + "auxiliary_loss_mlp": 0.01082516, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00011849, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7042302214712433, + "language_loss": 0.55689251, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57872546, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.2929155826568604 + }, + { + "auxiliary_loss_clip": 0.01149033, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00051522, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.9162040681450265, + "language_loss": 0.7761699, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79870677, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.562260389328003 + }, + { + "auxiliary_loss_clip": 0.0115087, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00066161, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 4.177948214992031, + "language_loss": 0.74479997, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76735008, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.5549421310424805 + }, + { + "auxiliary_loss_clip": 0.01130908, + "auxiliary_loss_mlp": 0.0110662, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00057173, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.7583767123816234, + "language_loss": 0.66970313, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69207841, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 2.644343852996826 + }, + { + "auxiliary_loss_clip": 0.01148877, + "auxiliary_loss_mlp": 0.0110511, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00077796, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 1.4590103078156091, + "language_loss": 0.78170705, + "learning_rate": 6.068004041624453e-07, + "loss": 0.8042469, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.5724892616271973 + }, + { + "auxiliary_loss_clip": 0.01165472, + "auxiliary_loss_mlp": 0.01104287, + "balance_loss_clip": 1.00202835, + "balance_loss_mlp": 1.00052762, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 1.9609223516211556, + "language_loss": 0.80416507, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82686269, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.505614995956421 + }, + { + "auxiliary_loss_clip": 0.01150592, + "auxiliary_loss_mlp": 0.00747367, + "balance_loss_clip": 1.00191557, + "balance_loss_mlp": 1.00048363, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.6566286757371997, + "language_loss": 0.74087399, + "learning_rate": 6.062416635517326e-07, + "loss": 0.7598536, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 2.572242498397827 + }, + { + "auxiliary_loss_clip": 0.0111667, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_clip": 1.00182009, + "balance_loss_mlp": 1.00054276, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.7380446630459148, + "language_loss": 0.72131562, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74352908, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 2.6819403171539307 + }, + { + "auxiliary_loss_clip": 0.01133428, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_clip": 1.00176239, + "balance_loss_mlp": 1.00033569, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 3.261411721492605, + "language_loss": 0.72195566, + "learning_rate": 6.056831343468414e-07, + "loss": 0.7443251, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.6593215465545654 + }, + { + "auxiliary_loss_clip": 0.01115205, + "auxiliary_loss_mlp": 0.01103575, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00048316, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.9508412111260658, + "language_loss": 0.81174433, + "learning_rate": 6.054039490480539e-07, + "loss": 0.83393216, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 2.596857786178589 + }, + { + "auxiliary_loss_clip": 0.01087603, + "auxiliary_loss_mlp": 0.0110488, + "balance_loss_clip": 1.00167918, + "balance_loss_mlp": 1.00054836, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 2.548812101269014, + "language_loss": 0.85167289, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87359774, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.686272621154785 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.01105907, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00062108, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 2.0620139926890517, + "language_loss": 0.74145955, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76368624, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.6401867866516113 + }, + { + "auxiliary_loss_clip": 0.01096441, + "auxiliary_loss_mlp": 0.01082222, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00020564, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8234263074814723, + "language_loss": 0.63586473, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65765136, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.106863498687744 + }, + { + "auxiliary_loss_clip": 0.01132156, + "auxiliary_loss_mlp": 0.01105354, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00045013, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 1.90066801700682, + "language_loss": 0.70229298, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72466803, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 4.013644218444824 + }, + { + "auxiliary_loss_clip": 0.0113179, + "auxiliary_loss_mlp": 0.01103762, + "balance_loss_clip": 1.001966, + "balance_loss_mlp": 1.00047874, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.6247587963414452, + "language_loss": 0.77602607, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79838163, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 2.586789846420288 + }, + { + "auxiliary_loss_clip": 0.01146639, + "auxiliary_loss_mlp": 0.01081612, + "balance_loss_clip": 1.00141478, + "balance_loss_mlp": 0.99997789, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7810964788044773, + "language_loss": 0.57322562, + "learning_rate": 6.037299481733886e-07, + "loss": 0.5955081, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 3.1660354137420654 + }, + { + "auxiliary_loss_clip": 0.0113399, + "auxiliary_loss_mlp": 0.0110449, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00044417, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.3573023348772961, + "language_loss": 0.71383494, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73621976, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 2.6434502601623535 + }, + { + "auxiliary_loss_clip": 0.01133807, + "auxiliary_loss_mlp": 0.01104602, + "balance_loss_clip": 1.00167251, + "balance_loss_mlp": 1.00046086, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.7011803235449177, + "language_loss": 0.80731332, + "learning_rate": 6.031723713426135e-07, + "loss": 0.82969743, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.622758388519287 + }, + { + "auxiliary_loss_clip": 0.01131853, + "auxiliary_loss_mlp": 0.01104357, + "balance_loss_clip": 1.0019908, + "balance_loss_mlp": 1.00059676, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 2.046442954014753, + "language_loss": 0.7495712, + "learning_rate": 6.028936623737067e-07, + "loss": 0.77193332, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.654325246810913 + }, + { + "auxiliary_loss_clip": 0.01165528, + "auxiliary_loss_mlp": 0.01105016, + "balance_loss_clip": 1.00194979, + "balance_loss_mlp": 1.00058842, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 2.559956645562177, + "language_loss": 0.73867333, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76137877, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.4805102348327637 + }, + { + "auxiliary_loss_clip": 0.01115124, + "auxiliary_loss_mlp": 0.01104781, + "balance_loss_clip": 1.00175273, + "balance_loss_mlp": 1.00054407, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.5414776223073816, + "language_loss": 0.67686087, + "learning_rate": 6.023364033816956e-07, + "loss": 0.6990599, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 2.6581101417541504 + }, + { + "auxiliary_loss_clip": 0.0116533, + "auxiliary_loss_mlp": 0.01103504, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00041199, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.5911621171217176, + "language_loss": 0.74599648, + "learning_rate": 6.020578533797229e-07, + "loss": 0.76868474, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.5005335807800293 + }, + { + "auxiliary_loss_clip": 0.01165595, + "auxiliary_loss_mlp": 0.01104423, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00056779, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.168234537681792, + "language_loss": 0.72039509, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74309534, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 3.8631556034088135 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.0110395, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00047636, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.7549667553822585, + "language_loss": 0.72364855, + "learning_rate": 6.015009124166576e-07, + "loss": 0.7463423, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 4.1037821769714355 + }, + { + "auxiliary_loss_clip": 0.01134033, + "auxiliary_loss_mlp": 0.01104328, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00037742, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.8921664755227148, + "language_loss": 0.84794205, + "learning_rate": 6.012225214766844e-07, + "loss": 0.87032568, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.562731981277466 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01104864, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00043631, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.0217101453612947, + "language_loss": 0.73403388, + "learning_rate": 6.009441835784927e-07, + "loss": 0.7562319, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.6980350017547607 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.0110416, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00059104, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 1.8938217591691526, + "language_loss": 0.67841601, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70094478, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 3.965726137161255 + }, + { + "auxiliary_loss_clip": 0.01135927, + "auxiliary_loss_mlp": 0.01104404, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00045383, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.6472561481530668, + "language_loss": 0.68680429, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70920765, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.5811896324157715 + }, + { + "auxiliary_loss_clip": 0.0114877, + "auxiliary_loss_mlp": 0.01105803, + "balance_loss_clip": 1.00198817, + "balance_loss_mlp": 1.00061274, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.0170056946260124, + "language_loss": 0.73169512, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75424081, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 2.5878570079803467 + }, + { + "auxiliary_loss_clip": 0.01165279, + "auxiliary_loss_mlp": 0.01104421, + "balance_loss_clip": 1.00185895, + "balance_loss_mlp": 1.00056577, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 5.489465911362603, + "language_loss": 0.67771089, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70040786, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.5238840579986572 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01104893, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00056064, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.769704941854283, + "language_loss": 0.87253797, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89491069, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 2.546330690383911 + }, + { + "auxiliary_loss_clip": 0.01100277, + "auxiliary_loss_mlp": 0.01103139, + "balance_loss_clip": 1.00165796, + "balance_loss_mlp": 1.00042844, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.7668689367522292, + "language_loss": 0.77237689, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79441106, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 2.682143211364746 + }, + { + "auxiliary_loss_clip": 0.01165493, + "auxiliary_loss_mlp": 0.01104412, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.00046158, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.8171645624507546, + "language_loss": 0.69786555, + "learning_rate": 5.98997304347386e-07, + "loss": 0.7205646, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 2.5503480434417725 + }, + { + "auxiliary_loss_clip": 0.01134209, + "auxiliary_loss_mlp": 0.01104889, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00055671, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 1.922287719736112, + "language_loss": 0.86368525, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88607621, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.553785562515259 + }, + { + "auxiliary_loss_clip": 0.01150302, + "auxiliary_loss_mlp": 0.01104818, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00048602, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 1.7182051715488569, + "language_loss": 0.78019083, + "learning_rate": 5.98441531115812e-07, + "loss": 0.802742, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.5595102310180664 + }, + { + "auxiliary_loss_clip": 0.01148684, + "auxiliary_loss_mlp": 0.01105681, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00049043, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 1.9636429242063738, + "language_loss": 0.6285888, + "learning_rate": 5.981637242156135e-07, + "loss": 0.65113246, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 2.6185390949249268 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00058937, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.5047966987354662, + "language_loss": 0.73492074, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75731456, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 2.6220712661743164 + }, + { + "auxiliary_loss_clip": 0.01132468, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00049806, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 1.7348806836322195, + "language_loss": 0.78703427, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80941004, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.571275472640991 + }, + { + "auxiliary_loss_clip": 0.01144488, + "auxiliary_loss_mlp": 0.01081244, + "balance_loss_clip": 1.00134683, + "balance_loss_mlp": 0.99999124, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7064962573999521, + "language_loss": 0.50423783, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52649516, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 3.1025593280792236 + }, + { + "auxiliary_loss_clip": 0.01148837, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.00051808, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.7276770005552473, + "language_loss": 0.71709335, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73963219, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 2.578564167022705 + }, + { + "auxiliary_loss_clip": 0.0113572, + "auxiliary_loss_mlp": 0.01104552, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00050628, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.6444690944792997, + "language_loss": 0.79920226, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82160497, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.708451747894287 + }, + { + "auxiliary_loss_clip": 0.01098897, + "auxiliary_loss_mlp": 0.0110498, + "balance_loss_clip": 1.00171483, + "balance_loss_mlp": 1.00036192, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.7951797518747583, + "language_loss": 0.7883541, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81039286, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.6907687187194824 + }, + { + "auxiliary_loss_clip": 0.011017, + "auxiliary_loss_mlp": 0.01104378, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00061798, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.4880211511941506, + "language_loss": 0.70655096, + "learning_rate": 5.96220564921515e-07, + "loss": 0.72861183, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.6425862312316895 + }, + { + "auxiliary_loss_clip": 0.0113371, + "auxiliary_loss_mlp": 0.00747326, + "balance_loss_clip": 1.00173771, + "balance_loss_mlp": 1.00045741, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.4454853891323334, + "language_loss": 0.75457573, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77338606, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 2.6400868892669678 + }, + { + "auxiliary_loss_clip": 0.01133685, + "auxiliary_loss_mlp": 0.01104762, + "balance_loss_clip": 1.0017041, + "balance_loss_mlp": 1.00052512, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 1.8084899670970616, + "language_loss": 0.75925756, + "learning_rate": 5.956658554770371e-07, + "loss": 0.78164196, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.5943024158477783 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01107181, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00056016, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.081803180214871, + "language_loss": 0.66518402, + "learning_rate": 5.953885806282768e-07, + "loss": 0.68747342, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 2.707904100418091 + }, + { + "auxiliary_loss_clip": 0.0113432, + "auxiliary_loss_mlp": 0.01105, + "balance_loss_clip": 1.00200856, + "balance_loss_mlp": 1.0005722, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 1.7658280914790812, + "language_loss": 0.68344879, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70584196, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.607828140258789 + }, + { + "auxiliary_loss_clip": 0.01131906, + "auxiliary_loss_mlp": 0.01105741, + "balance_loss_clip": 1.00162792, + "balance_loss_mlp": 1.00045562, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.5540821164123353, + "language_loss": 0.74983132, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77220774, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.657363176345825 + }, + { + "auxiliary_loss_clip": 0.01150874, + "auxiliary_loss_mlp": 0.01105286, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00057292, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 4.670628948634873, + "language_loss": 0.7403264, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76288801, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.552781581878662 + }, + { + "auxiliary_loss_clip": 0.01165446, + "auxiliary_loss_mlp": 0.01104762, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00052571, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.9525445437259665, + "language_loss": 0.63128841, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65399051, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.5199196338653564 + }, + { + "auxiliary_loss_clip": 0.01054247, + "auxiliary_loss_mlp": 0.01105522, + "balance_loss_clip": 1.00155365, + "balance_loss_mlp": 1.00052297, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 2.5182778724207577, + "language_loss": 0.66642195, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68801963, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 3.010139226913452 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01105411, + "balance_loss_clip": 1.00188231, + "balance_loss_mlp": 1.00069761, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.6768260803470127, + "language_loss": 0.6738075, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69634545, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 4.578796863555908 + }, + { + "auxiliary_loss_clip": 0.01165385, + "auxiliary_loss_mlp": 0.01104531, + "balance_loss_clip": 1.00190961, + "balance_loss_mlp": 1.00058043, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.755228919519254, + "language_loss": 0.71657431, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73927349, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 2.7944188117980957 + }, + { + "auxiliary_loss_clip": 0.01102687, + "auxiliary_loss_mlp": 0.01105311, + "balance_loss_clip": 1.00167811, + "balance_loss_mlp": 1.00040698, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.8221540093862478, + "language_loss": 0.73596448, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75804448, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.693791627883911 + }, + { + "auxiliary_loss_clip": 0.01132462, + "auxiliary_loss_mlp": 0.01105587, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00058758, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.1491154778107546, + "language_loss": 0.76569963, + "learning_rate": 5.928955050857456e-07, + "loss": 0.7880801, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 2.5406289100646973 + }, + { + "auxiliary_loss_clip": 0.01119657, + "auxiliary_loss_mlp": 0.01105336, + "balance_loss_clip": 1.00170398, + "balance_loss_mlp": 1.00052691, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.484509702838142, + "language_loss": 0.69128919, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71353912, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 2.6077120304107666 + }, + { + "auxiliary_loss_clip": 0.01119257, + "auxiliary_loss_mlp": 0.011043, + "balance_loss_clip": 1.00165617, + "balance_loss_mlp": 1.00063586, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.0688407123183796, + "language_loss": 0.71725643, + "learning_rate": 5.923420749619974e-07, + "loss": 0.739492, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 2.5938754081726074 + }, + { + "auxiliary_loss_clip": 0.01165316, + "auxiliary_loss_mlp": 0.00747488, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.0004065, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.574275349316166, + "language_loss": 0.72622514, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74535316, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.471540689468384 + }, + { + "auxiliary_loss_clip": 0.01116703, + "auxiliary_loss_mlp": 0.01104733, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00040078, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 3.0025998442020048, + "language_loss": 0.67223811, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69445246, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.6171438694000244 + }, + { + "auxiliary_loss_clip": 0.0113388, + "auxiliary_loss_mlp": 0.01103968, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00059009, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.5822589899478843, + "language_loss": 0.78476655, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80714512, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 2.5761466026306152 + }, + { + "auxiliary_loss_clip": 0.01148787, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00063848, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.439074245367801, + "language_loss": 0.75662112, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77914917, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.5931403636932373 + }, + { + "auxiliary_loss_clip": 0.01101051, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_clip": 1.0018245, + "balance_loss_mlp": 1.00057769, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 8.665857616681727, + "language_loss": 0.62632525, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64838874, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 4.2615065574646 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.01104116, + "balance_loss_clip": 1.00167155, + "balance_loss_mlp": 1.00045168, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.6154727265036086, + "language_loss": 0.74839044, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77045214, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 4.184586048126221 + }, + { + "auxiliary_loss_clip": 0.01115723, + "auxiliary_loss_mlp": 0.01105163, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00044966, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 1.602209828349244, + "language_loss": 0.6257236, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64793247, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 2.6626393795013428 + }, + { + "auxiliary_loss_clip": 0.0116108, + "auxiliary_loss_mlp": 0.01081228, + "balance_loss_clip": 1.00137448, + "balance_loss_mlp": 0.9999752, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9550211799798889, + "language_loss": 0.6076864, + "learning_rate": 5.901304904471307e-07, + "loss": 0.63010949, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.85738468170166 + }, + { + "auxiliary_loss_clip": 0.01132132, + "auxiliary_loss_mlp": 0.01105011, + "balance_loss_clip": 1.00168991, + "balance_loss_mlp": 1.00058365, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.008818216872592, + "language_loss": 0.78769904, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81007046, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 4.144235849380493 + }, + { + "auxiliary_loss_clip": 0.01135683, + "auxiliary_loss_mlp": 0.0110485, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.00051785, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.121947555327092, + "language_loss": 0.78214097, + "learning_rate": 5.895781287327612e-07, + "loss": 0.80454624, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.581392765045166 + }, + { + "auxiliary_loss_clip": 0.01165567, + "auxiliary_loss_mlp": 0.01105009, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00048637, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.692239613917611, + "language_loss": 0.83388901, + "learning_rate": 5.893020280953493e-07, + "loss": 0.8565948, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.4846012592315674 + }, + { + "auxiliary_loss_clip": 0.01165463, + "auxiliary_loss_mlp": 0.01105066, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.00054288, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.714196045024025, + "language_loss": 0.83255059, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85525584, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 2.4977869987487793 + }, + { + "auxiliary_loss_clip": 0.01117309, + "auxiliary_loss_mlp": 0.01103591, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00040388, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.4829437014522149, + "language_loss": 0.71285796, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73506701, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.6290488243103027 + }, + { + "auxiliary_loss_clip": 0.01165514, + "auxiliary_loss_mlp": 0.00747539, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00044513, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.6709832327365086, + "language_loss": 0.6894998, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70863032, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 2.5185914039611816 + }, + { + "auxiliary_loss_clip": 0.01148577, + "auxiliary_loss_mlp": 0.01104367, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00051212, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.5678834190432496, + "language_loss": 0.92114198, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94367146, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.53369402885437 + }, + { + "auxiliary_loss_clip": 0.01133695, + "auxiliary_loss_mlp": 0.01104294, + "balance_loss_clip": 1.00177026, + "balance_loss_mlp": 1.00053406, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.8216487564270614, + "language_loss": 0.65535831, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67773819, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 2.6888883113861084 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.0110418, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00032449, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.4414733099722061, + "language_loss": 0.73524576, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75777113, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.614906072616577 + }, + { + "auxiliary_loss_clip": 0.01148688, + "auxiliary_loss_mlp": 0.01104379, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00061917, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.8887602802745977, + "language_loss": 0.71251225, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73504293, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 2.495468854904175 + }, + { + "auxiliary_loss_clip": 0.01165511, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.00036371, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 2.060745161804566, + "language_loss": 0.6634298, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68613088, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 2.4817676544189453 + }, + { + "auxiliary_loss_clip": 0.01115657, + "auxiliary_loss_mlp": 0.01105323, + "balance_loss_clip": 1.00166118, + "balance_loss_mlp": 1.00051403, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.5942863193397103, + "language_loss": 0.80925238, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83146214, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 2.6207706928253174 + }, + { + "auxiliary_loss_clip": 0.01116349, + "auxiliary_loss_mlp": 0.01104194, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00062501, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.7952793610651516, + "language_loss": 0.72104514, + "learning_rate": 5.865439656071993e-07, + "loss": 0.74325061, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.61252498626709 + }, + { + "auxiliary_loss_clip": 0.01033287, + "auxiliary_loss_mlp": 0.01103608, + "balance_loss_clip": 1.00154161, + "balance_loss_mlp": 1.00051618, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.8034164998844944, + "language_loss": 0.8040117, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82538068, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 2.8234455585479736 + }, + { + "auxiliary_loss_clip": 0.01117024, + "auxiliary_loss_mlp": 0.01105647, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.0005517, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.735567719802569, + "language_loss": 0.83375615, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85598284, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 2.648066520690918 + }, + { + "auxiliary_loss_clip": 0.01132289, + "auxiliary_loss_mlp": 0.01104197, + "balance_loss_clip": 1.00169885, + "balance_loss_mlp": 1.0004369, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.637863342945225, + "language_loss": 0.62802476, + "learning_rate": 5.857175915537845e-07, + "loss": 0.65038961, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.5812079906463623 + }, + { + "auxiliary_loss_clip": 0.01131466, + "auxiliary_loss_mlp": 0.00747535, + "balance_loss_clip": 1.00198519, + "balance_loss_mlp": 1.00043941, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.132663256665882, + "language_loss": 0.63433236, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65312231, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 2.5612688064575195 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01104165, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00050044, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.7123680509652446, + "language_loss": 0.66158384, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68396235, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.5780792236328125 + }, + { + "auxiliary_loss_clip": 0.01131603, + "auxiliary_loss_mlp": 0.01103366, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00056016, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.5203912215840027, + "language_loss": 0.67510259, + "learning_rate": 5.848917001679335e-07, + "loss": 0.69745231, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.576500654220581 + }, + { + "auxiliary_loss_clip": 0.01149894, + "auxiliary_loss_mlp": 0.01104222, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00055754, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.885248830653275, + "language_loss": 0.66735744, + "learning_rate": 5.846165103474967e-07, + "loss": 0.68989861, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.5335748195648193 + }, + { + "auxiliary_loss_clip": 0.01135831, + "auxiliary_loss_mlp": 0.01103532, + "balance_loss_clip": 1.00168085, + "balance_loss_mlp": 1.00053561, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 1.7151878471632995, + "language_loss": 0.6139977, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63639128, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 2.6738545894622803 + }, + { + "auxiliary_loss_clip": 0.01165417, + "auxiliary_loss_mlp": 0.01104185, + "balance_loss_clip": 1.00204861, + "balance_loss_mlp": 1.00080657, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 2.122097507885295, + "language_loss": 0.79897606, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82167208, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.510256767272949 + }, + { + "auxiliary_loss_clip": 0.0116543, + "auxiliary_loss_mlp": 0.01104976, + "balance_loss_clip": 1.00194108, + "balance_loss_mlp": 1.00045359, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 3.157096911950654, + "language_loss": 0.79795837, + "learning_rate": 5.837912629568198e-07, + "loss": 0.82066238, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.4871954917907715 + }, + { + "auxiliary_loss_clip": 0.01150456, + "auxiliary_loss_mlp": 0.01103068, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.000453, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.3234691115094823, + "language_loss": 0.73155612, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75409138, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 2.555349349975586 + }, + { + "auxiliary_loss_clip": 0.0113245, + "auxiliary_loss_mlp": 0.01105606, + "balance_loss_clip": 1.00191081, + "balance_loss_mlp": 1.00051117, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 2.008553005374338, + "language_loss": 0.75092274, + "learning_rate": 5.83241366526202e-07, + "loss": 0.77330327, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 4.424482107162476 + }, + { + "auxiliary_loss_clip": 0.01115325, + "auxiliary_loss_mlp": 0.00747355, + "balance_loss_clip": 1.00168538, + "balance_loss_mlp": 1.00039077, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.4659864639636688, + "language_loss": 0.71536982, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73399663, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 2.6789121627807617 + }, + { + "auxiliary_loss_clip": 0.01165358, + "auxiliary_loss_mlp": 0.0110451, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00046468, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.5967946188271036, + "language_loss": 0.81509131, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83779001, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 2.508633852005005 + }, + { + "auxiliary_loss_clip": 0.01134761, + "auxiliary_loss_mlp": 0.01105594, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00049961, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.800191445866447, + "language_loss": 0.6999433, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72234684, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.6194241046905518 + }, + { + "auxiliary_loss_clip": 0.01165331, + "auxiliary_loss_mlp": 0.01104347, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.0004915, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.5672013796162874, + "language_loss": 0.70653343, + "learning_rate": 5.821422184318893e-07, + "loss": 0.72923017, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 2.522641658782959 + }, + { + "auxiliary_loss_clip": 0.01084039, + "auxiliary_loss_mlp": 0.01105387, + "balance_loss_clip": 1.00162268, + "balance_loss_mlp": 1.0007689, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.4005057695304652, + "language_loss": 0.5963279, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61822218, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 2.7247159481048584 + }, + { + "auxiliary_loss_clip": 0.01134837, + "auxiliary_loss_mlp": 0.01104144, + "balance_loss_clip": 1.00179183, + "balance_loss_mlp": 1.00057483, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.4752195021275554, + "language_loss": 0.60028148, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62267131, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 2.6673145294189453 + }, + { + "auxiliary_loss_clip": 0.01117606, + "auxiliary_loss_mlp": 0.0110467, + "balance_loss_clip": 1.00172579, + "balance_loss_mlp": 1.00043356, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.6108675560561854, + "language_loss": 0.73085278, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75307548, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.6368818283081055 + }, + { + "auxiliary_loss_clip": 0.01128863, + "auxiliary_loss_mlp": 0.01081242, + "balance_loss_clip": 1.00148094, + "balance_loss_mlp": 0.99998897, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8047879707170768, + "language_loss": 0.67690372, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69900477, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.250725507736206 + }, + { + "auxiliary_loss_clip": 0.01118833, + "auxiliary_loss_mlp": 0.01105338, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00052953, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.8145380529908484, + "language_loss": 0.84394312, + "learning_rate": 5.807694931114979e-07, + "loss": 0.86618483, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 3.9765803813934326 + }, + { + "auxiliary_loss_clip": 0.01115331, + "auxiliary_loss_mlp": 0.01103735, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00054729, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.110275173383258, + "language_loss": 0.7484529, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77064353, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 4.058851480484009 + }, + { + "auxiliary_loss_clip": 0.01132178, + "auxiliary_loss_mlp": 0.01105813, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00052762, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.081982552336121, + "language_loss": 0.77121007, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79358995, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.5568583011627197 + }, + { + "auxiliary_loss_clip": 0.01117271, + "auxiliary_loss_mlp": 0.01103413, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00051188, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 2.049630456452277, + "language_loss": 0.82148212, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84368896, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 2.669375419616699 + }, + { + "auxiliary_loss_clip": 0.01133939, + "auxiliary_loss_mlp": 0.01105685, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00059009, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.1174933007283414, + "language_loss": 0.82549179, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84788799, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 3.9293224811553955 + }, + { + "auxiliary_loss_clip": 0.01133308, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_clip": 1.00174582, + "balance_loss_mlp": 1.00056171, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 1.9080652691963342, + "language_loss": 0.7325778, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75495124, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.564286470413208 + }, + { + "auxiliary_loss_clip": 0.01144288, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_clip": 1.0012784, + "balance_loss_mlp": 0.99998122, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.9111612732151557, + "language_loss": 0.60910511, + "learning_rate": 5.791239988143024e-07, + "loss": 0.63136041, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.158438205718994 + }, + { + "auxiliary_loss_clip": 0.0116541, + "auxiliary_loss_mlp": 0.01104665, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00061941, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 1.9543541734650745, + "language_loss": 0.67253304, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69523382, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.4969892501831055 + }, + { + "auxiliary_loss_clip": 0.01165271, + "auxiliary_loss_mlp": 0.01104139, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00037932, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.7215353171606416, + "language_loss": 0.75809205, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78078616, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 2.5069949626922607 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01104701, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00055957, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 4.254856990714836, + "language_loss": 0.6305384, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65293795, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.6518630981445312 + }, + { + "auxiliary_loss_clip": 0.01103269, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.00046182, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 5.683434049240958, + "language_loss": 0.73959154, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75809789, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 2.649174451828003 + }, + { + "auxiliary_loss_clip": 0.01148662, + "auxiliary_loss_mlp": 0.01104263, + "balance_loss_clip": 1.00179648, + "balance_loss_mlp": 1.0005033, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.4416993400741145, + "language_loss": 0.69004649, + "learning_rate": 5.777542351646356e-07, + "loss": 0.71257573, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 2.653981924057007 + }, + { + "auxiliary_loss_clip": 0.0114926, + "auxiliary_loss_mlp": 0.01106318, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.0005554, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 1.9403925009792957, + "language_loss": 0.6339103, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65646613, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 2.527264356613159 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.01103134, + "balance_loss_clip": 1.00169134, + "balance_loss_mlp": 1.00051844, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.6071898519958348, + "language_loss": 0.7790792, + "learning_rate": 5.772067071539786e-07, + "loss": 0.80144823, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.6163978576660156 + }, + { + "auxiliary_loss_clip": 0.01161028, + "auxiliary_loss_mlp": 0.01081233, + "balance_loss_clip": 1.00133729, + "balance_loss_mlp": 0.99998039, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8114616247055286, + "language_loss": 0.61498749, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63741016, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 3.1420912742614746 + }, + { + "auxiliary_loss_clip": 0.0111751, + "auxiliary_loss_mlp": 0.00747535, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00048435, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 2.1014975667873834, + "language_loss": 0.74198836, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76063877, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 2.677039384841919 + }, + { + "auxiliary_loss_clip": 0.01132029, + "auxiliary_loss_mlp": 0.01104209, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00054491, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.867169525241686, + "language_loss": 0.75230998, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77467239, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.545714855194092 + }, + { + "auxiliary_loss_clip": 0.01131939, + "auxiliary_loss_mlp": 0.01104043, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00056911, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 1.6754594112111274, + "language_loss": 0.73184776, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75420755, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 2.5395331382751465 + }, + { + "auxiliary_loss_clip": 0.01165518, + "auxiliary_loss_mlp": 0.01104686, + "balance_loss_clip": 1.00210154, + "balance_loss_mlp": 1.00064015, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.7006102855473528, + "language_loss": 0.64720416, + "learning_rate": 5.758388314770408e-07, + "loss": 0.6699062, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.574125051498413 + }, + { + "auxiliary_loss_clip": 0.0110289, + "auxiliary_loss_mlp": 0.01105268, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00055504, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 2.343612071766808, + "language_loss": 0.68576235, + "learning_rate": 5.7556541831317e-07, + "loss": 0.7078439, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.637627363204956 + }, + { + "auxiliary_loss_clip": 0.01134169, + "auxiliary_loss_mlp": 0.01104572, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00052583, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.7376316003011807, + "language_loss": 0.80980527, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83219272, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.595695734024048 + }, + { + "auxiliary_loss_clip": 0.0114868, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_clip": 1.00178659, + "balance_loss_mlp": 1.00051856, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.6131860748918918, + "language_loss": 0.66462326, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68715763, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 2.6724047660827637 + }, + { + "auxiliary_loss_clip": 0.01165434, + "auxiliary_loss_mlp": 0.01105421, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.0006125, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.102691507246767, + "language_loss": 0.65399557, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67670417, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 2.469402551651001 + }, + { + "auxiliary_loss_clip": 0.0115057, + "auxiliary_loss_mlp": 0.01104862, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00053024, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 1.9866659593965674, + "language_loss": 0.70045817, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72301251, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.5419704914093018 + }, + { + "auxiliary_loss_clip": 0.01132021, + "auxiliary_loss_mlp": 0.0110552, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00042486, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.8681339418683778, + "language_loss": 0.67223972, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69461513, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.6257076263427734 + }, + { + "auxiliary_loss_clip": 0.0114861, + "auxiliary_loss_mlp": 0.01105239, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00043011, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.396618386718846, + "language_loss": 0.66415441, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68669295, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 2.5269718170166016 + }, + { + "auxiliary_loss_clip": 0.01134674, + "auxiliary_loss_mlp": 0.01104717, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.00048018, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 2.2442030415243197, + "language_loss": 0.75788271, + "learning_rate": 5.736530391580765e-07, + "loss": 0.7802766, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 2.6102190017700195 + }, + { + "auxiliary_loss_clip": 0.01117136, + "auxiliary_loss_mlp": 0.01105633, + "balance_loss_clip": 1.00179994, + "balance_loss_mlp": 1.00063312, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.7435724349311734, + "language_loss": 0.78860527, + "learning_rate": 5.733800584019508e-07, + "loss": 0.81083298, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 2.60408091545105 + }, + { + "auxiliary_loss_clip": 0.0113619, + "auxiliary_loss_mlp": 0.01104119, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00055027, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.4461749729418705, + "language_loss": 0.8022176, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82462072, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 4.016973257064819 + }, + { + "auxiliary_loss_clip": 0.01130813, + "auxiliary_loss_mlp": 0.0110567, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00047934, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.539333826966178, + "language_loss": 0.72747707, + "learning_rate": 5.728342591927611e-07, + "loss": 0.74984193, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.5992846488952637 + }, + { + "auxiliary_loss_clip": 0.01150604, + "auxiliary_loss_mlp": 0.01103893, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00051498, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 1.9686530276310856, + "language_loss": 0.67088485, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69342983, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 2.5322775840759277 + }, + { + "auxiliary_loss_clip": 0.01146399, + "auxiliary_loss_mlp": 0.01081649, + "balance_loss_clip": 1.00131559, + "balance_loss_mlp": 1.0000149, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6739314538559004, + "language_loss": 0.48990083, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51218134, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 3.0800094604492188 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.0110395, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00066698, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.455119693323736, + "language_loss": 0.76656145, + "learning_rate": 5.720159662918451e-07, + "loss": 0.7890861, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.536360740661621 + }, + { + "auxiliary_loss_clip": 0.01117659, + "auxiliary_loss_mlp": 0.01104674, + "balance_loss_clip": 1.0017699, + "balance_loss_mlp": 1.00053263, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.4516502887900793, + "language_loss": 0.69048232, + "learning_rate": 5.717433102763462e-07, + "loss": 0.71270561, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.6565933227539062 + }, + { + "auxiliary_loss_clip": 0.01144355, + "auxiliary_loss_mlp": 0.01081226, + "balance_loss_clip": 1.00127125, + "balance_loss_mlp": 0.99997294, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7548841513686296, + "language_loss": 0.62752765, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64978349, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 3.106806993484497 + }, + { + "auxiliary_loss_clip": 0.01115041, + "auxiliary_loss_mlp": 0.01103644, + "balance_loss_clip": 1.00167632, + "balance_loss_mlp": 1.00055146, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.6975797780204962, + "language_loss": 0.71406496, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73625183, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.669976234436035 + }, + { + "auxiliary_loss_clip": 0.01085715, + "auxiliary_loss_mlp": 0.01104342, + "balance_loss_clip": 1.00162888, + "balance_loss_mlp": 1.00077331, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 1.8147929323681007, + "language_loss": 0.80599409, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82789469, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 2.6905670166015625 + }, + { + "auxiliary_loss_clip": 0.01165611, + "auxiliary_loss_mlp": 0.01106295, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.00053275, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.953094710987435, + "language_loss": 0.79908133, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82180041, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.5173354148864746 + }, + { + "auxiliary_loss_clip": 0.01117721, + "auxiliary_loss_mlp": 0.01104973, + "balance_loss_clip": 1.00164783, + "balance_loss_mlp": 1.00064111, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.057907431929199, + "language_loss": 0.79309231, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81531924, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 5.443384408950806 + }, + { + "auxiliary_loss_clip": 0.01148541, + "auxiliary_loss_mlp": 0.01103791, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00041282, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.4808871774101355, + "language_loss": 0.68439835, + "learning_rate": 5.701085118974505e-07, + "loss": 0.7069217, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.600935697555542 + }, + { + "auxiliary_loss_clip": 0.01150698, + "auxiliary_loss_mlp": 0.01105623, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00052774, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 1.9243127264818631, + "language_loss": 0.72776115, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75032431, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 2.534879684448242 + }, + { + "auxiliary_loss_clip": 0.01132355, + "auxiliary_loss_mlp": 0.01081222, + "balance_loss_clip": 1.00138962, + "balance_loss_mlp": 0.9999693, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8594144480786705, + "language_loss": 0.64927095, + "learning_rate": 5.695640127673347e-07, + "loss": 0.67140675, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 4.496395587921143 + }, + { + "auxiliary_loss_clip": 0.01148773, + "auxiliary_loss_mlp": 0.01104548, + "balance_loss_clip": 1.00164485, + "balance_loss_mlp": 1.000597, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.5941579416740936, + "language_loss": 0.79306293, + "learning_rate": 5.692918445605293e-07, + "loss": 0.8155961, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.54453706741333 + }, + { + "auxiliary_loss_clip": 0.01148657, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_clip": 1.00184071, + "balance_loss_mlp": 1.00036192, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.6692961494337097, + "language_loss": 0.68938613, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71191108, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.5966711044311523 + }, + { + "auxiliary_loss_clip": 0.01165522, + "auxiliary_loss_mlp": 0.01104993, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00056529, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.8929400455383583, + "language_loss": 0.70405889, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72676408, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 2.5628137588500977 + }, + { + "auxiliary_loss_clip": 0.01148626, + "auxiliary_loss_mlp": 0.01103846, + "balance_loss_clip": 1.00173891, + "balance_loss_mlp": 1.00046825, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.552664813577874, + "language_loss": 0.83512139, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85764611, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.5949368476867676 + }, + { + "auxiliary_loss_clip": 0.01131815, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00066459, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7906717320774364, + "language_loss": 0.68577367, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70813799, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 2.568166732788086 + }, + { + "auxiliary_loss_clip": 0.01148678, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00042439, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.9907123583934037, + "language_loss": 0.70368421, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72620803, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.5636470317840576 + }, + { + "auxiliary_loss_clip": 0.01149245, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.00190318, + "balance_loss_mlp": 1.00071883, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.5945575544056523, + "language_loss": 0.79349911, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81605542, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.5398685932159424 + }, + { + "auxiliary_loss_clip": 0.01165398, + "auxiliary_loss_mlp": 0.00747377, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.000494, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.8006427841739583, + "language_loss": 0.88280904, + "learning_rate": 5.673881867632959e-07, + "loss": 0.90193677, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.577261209487915 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01105857, + "balance_loss_clip": 1.00162864, + "balance_loss_mlp": 1.00057173, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 1.9519069609784518, + "language_loss": 0.83095741, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85303795, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 2.6113884449005127 + }, + { + "auxiliary_loss_clip": 0.01131925, + "auxiliary_loss_mlp": 0.01103071, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.00055087, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.500085106083242, + "language_loss": 0.78558969, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80793965, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 2.553366184234619 + }, + { + "auxiliary_loss_clip": 0.01117484, + "auxiliary_loss_mlp": 0.01103663, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.00057125, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.9474820198388816, + "language_loss": 0.64348245, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66569388, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.5927605628967285 + }, + { + "auxiliary_loss_clip": 0.01134328, + "auxiliary_loss_mlp": 0.01105142, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00052369, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.8793084389692263, + "language_loss": 0.6636793, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68607402, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.637169361114502 + }, + { + "auxiliary_loss_clip": 0.0114881, + "auxiliary_loss_mlp": 0.01105418, + "balance_loss_clip": 1.00185812, + "balance_loss_mlp": 1.00051379, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.504859440952284, + "language_loss": 0.73035294, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75289524, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 2.5998880863189697 + }, + { + "auxiliary_loss_clip": 0.01120859, + "auxiliary_loss_mlp": 0.01103617, + "balance_loss_clip": 1.00173783, + "balance_loss_mlp": 1.00052452, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.7122868122729225, + "language_loss": 0.73038661, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75263137, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 2.6735141277313232 + }, + { + "auxiliary_loss_clip": 0.01123057, + "auxiliary_loss_mlp": 0.01081698, + "balance_loss_clip": 1.00132215, + "balance_loss_mlp": 1.00006378, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7674086033390107, + "language_loss": 0.56709528, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58914286, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.131718397140503 + }, + { + "auxiliary_loss_clip": 0.01148712, + "auxiliary_loss_mlp": 0.01104936, + "balance_loss_clip": 1.0019238, + "balance_loss_mlp": 1.00050867, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.7355236244190981, + "language_loss": 0.75029147, + "learning_rate": 5.652158375447102e-07, + "loss": 0.77282792, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.563403844833374 + }, + { + "auxiliary_loss_clip": 0.01135622, + "auxiliary_loss_mlp": 0.01104326, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00047112, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 1.8811688462451097, + "language_loss": 0.72354478, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74594426, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.6851046085357666 + }, + { + "auxiliary_loss_clip": 0.01150728, + "auxiliary_loss_mlp": 0.01104974, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.0006423, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.1167887485807833, + "language_loss": 0.73089516, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75345224, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.547973871231079 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.00747609, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00050271, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.454723291268733, + "language_loss": 0.54053396, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55918968, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.61135196685791 + }, + { + "auxiliary_loss_clip": 0.01101125, + "auxiliary_loss_mlp": 0.01105858, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00057268, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 1.840674115441859, + "language_loss": 0.79432935, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81639916, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 2.653214693069458 + }, + { + "auxiliary_loss_clip": 0.0111695, + "auxiliary_loss_mlp": 0.01104156, + "balance_loss_clip": 1.00171161, + "balance_loss_mlp": 1.00049186, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 2.185890914672562, + "language_loss": 0.77587056, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79808164, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.6091597080230713 + }, + { + "auxiliary_loss_clip": 0.01148652, + "auxiliary_loss_mlp": 0.01104527, + "balance_loss_clip": 1.00186789, + "balance_loss_mlp": 1.00048137, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.4427050967038852, + "language_loss": 0.80210805, + "learning_rate": 5.635888604430059e-07, + "loss": 0.8246398, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.58042573928833 + }, + { + "auxiliary_loss_clip": 0.01133975, + "auxiliary_loss_mlp": 0.01105539, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00044417, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.6095714407619932, + "language_loss": 0.62719524, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64959037, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.588895320892334 + }, + { + "auxiliary_loss_clip": 0.01116135, + "auxiliary_loss_mlp": 0.01103641, + "balance_loss_clip": 1.00166094, + "balance_loss_mlp": 1.00045347, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.2695037214497797, + "language_loss": 0.76092875, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78312659, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 2.6342902183532715 + }, + { + "auxiliary_loss_clip": 0.01133494, + "auxiliary_loss_mlp": 0.01103909, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00053096, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.5332583467851684, + "language_loss": 0.67985827, + "learning_rate": 5.627761070828974e-07, + "loss": 0.7022323, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 4.052229404449463 + }, + { + "auxiliary_loss_clip": 0.01118737, + "auxiliary_loss_mlp": 0.0074751, + "balance_loss_clip": 1.00158298, + "balance_loss_mlp": 1.00041962, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.087648754519299, + "language_loss": 0.83299196, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85165447, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 2.641364336013794 + }, + { + "auxiliary_loss_clip": 0.01132107, + "auxiliary_loss_mlp": 0.01105002, + "balance_loss_clip": 1.00180995, + "balance_loss_mlp": 1.00057507, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 1.7913382025297624, + "language_loss": 0.82537997, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84775108, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.536720037460327 + }, + { + "auxiliary_loss_clip": 0.0111669, + "auxiliary_loss_mlp": 0.00747358, + "balance_loss_clip": 1.00168586, + "balance_loss_mlp": 1.00040984, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.90576199516046, + "language_loss": 0.77299285, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79163325, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.669102668762207 + }, + { + "auxiliary_loss_clip": 0.0110299, + "auxiliary_loss_mlp": 0.01106132, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00065541, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.6102806761106048, + "language_loss": 0.71979743, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74188864, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 2.6596264839172363 + }, + { + "auxiliary_loss_clip": 0.01135776, + "auxiliary_loss_mlp": 0.01104859, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00062275, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 1.7784373800849118, + "language_loss": 0.64615631, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66856271, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 2.5627756118774414 + }, + { + "auxiliary_loss_clip": 0.01150322, + "auxiliary_loss_mlp": 0.01104566, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00052023, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.8217564481347301, + "language_loss": 0.71024609, + "learning_rate": 5.611520721310515e-07, + "loss": 0.732795, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.5292975902557373 + }, + { + "auxiliary_loss_clip": 0.01117332, + "auxiliary_loss_mlp": 0.01105451, + "balance_loss_clip": 1.00174236, + "balance_loss_mlp": 1.00064194, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.8393656661577804, + "language_loss": 0.69732618, + "learning_rate": 5.608815905436238e-07, + "loss": 0.71955401, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.6589975357055664 + }, + { + "auxiliary_loss_clip": 0.01133871, + "auxiliary_loss_mlp": 0.01105526, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.00052691, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.4264013611290611, + "language_loss": 0.69447494, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71686888, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 2.7092456817626953 + }, + { + "auxiliary_loss_clip": 0.01148855, + "auxiliary_loss_mlp": 0.01104605, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00055885, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.651494890834334, + "language_loss": 0.81611782, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83865237, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 2.5591964721679688 + }, + { + "auxiliary_loss_clip": 0.01117362, + "auxiliary_loss_mlp": 0.01105631, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00053573, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.578501352582401, + "language_loss": 0.76973706, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79196703, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 5.4932520389556885 + }, + { + "auxiliary_loss_clip": 0.01115651, + "auxiliary_loss_mlp": 0.011063, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00053763, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.1696089271469785, + "language_loss": 0.72820187, + "learning_rate": 5.598002100115933e-07, + "loss": 0.7504214, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.582132339477539 + }, + { + "auxiliary_loss_clip": 0.01148633, + "auxiliary_loss_mlp": 0.01104418, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00037193, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 2.01580368542343, + "language_loss": 0.70337307, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72590351, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 2.5370819568634033 + }, + { + "auxiliary_loss_clip": 0.01165345, + "auxiliary_loss_mlp": 0.01104283, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00052309, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.4247720591508464, + "language_loss": 0.72547716, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74817336, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.5292701721191406 + }, + { + "auxiliary_loss_clip": 0.01086914, + "auxiliary_loss_mlp": 0.01104938, + "balance_loss_clip": 1.00145972, + "balance_loss_mlp": 1.00041533, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.672376537344719, + "language_loss": 0.71491194, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73683047, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 4.072724342346191 + }, + { + "auxiliary_loss_clip": 0.01115427, + "auxiliary_loss_mlp": 0.0110504, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00051701, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.800998862411828, + "language_loss": 0.66608608, + "learning_rate": 5.587197032798461e-07, + "loss": 0.68829077, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.6496829986572266 + }, + { + "auxiliary_loss_clip": 0.0115078, + "auxiliary_loss_mlp": 0.01104535, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00048923, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.514645137045383, + "language_loss": 0.72338927, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74594235, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.561896800994873 + }, + { + "auxiliary_loss_clip": 0.01135335, + "auxiliary_loss_mlp": 0.01103941, + "balance_loss_clip": 1.0017606, + "balance_loss_mlp": 1.00056303, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.5457767874492419, + "language_loss": 0.73410499, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75649774, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 2.6884186267852783 + }, + { + "auxiliary_loss_clip": 0.01165391, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00050163, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 1.8202684868081984, + "language_loss": 0.69247663, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71517897, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 2.4922308921813965 + }, + { + "auxiliary_loss_clip": 0.01132326, + "auxiliary_loss_mlp": 0.01104863, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.0005306, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.962869115229455, + "language_loss": 0.6449846, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66735649, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.5973005294799805 + }, + { + "auxiliary_loss_clip": 0.01115613, + "auxiliary_loss_mlp": 0.01104991, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00056338, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 2.0181933709438775, + "language_loss": 0.6518597, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67406577, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.6874778270721436 + }, + { + "auxiliary_loss_clip": 0.01134455, + "auxiliary_loss_mlp": 0.01103585, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00039744, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.7459527423560237, + "language_loss": 0.83424306, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85662347, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.5625498294830322 + }, + { + "auxiliary_loss_clip": 0.01135984, + "auxiliary_loss_mlp": 0.01104498, + "balance_loss_clip": 1.00196826, + "balance_loss_mlp": 1.00064313, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.4228288947312833, + "language_loss": 0.68329054, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70569533, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 2.6368536949157715 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01104676, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00053513, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.5771473519455477, + "language_loss": 0.74263942, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76502466, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 2.6131694316864014 + }, + { + "auxiliary_loss_clip": 0.01150837, + "auxiliary_loss_mlp": 0.01105042, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00051904, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 2.014760613233843, + "language_loss": 0.78311265, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80567145, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 2.5206778049468994 + }, + { + "auxiliary_loss_clip": 0.01133763, + "auxiliary_loss_mlp": 0.01105015, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00039649, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 2.2387646216549535, + "language_loss": 0.79770362, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82009137, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.5545547008514404 + }, + { + "auxiliary_loss_clip": 0.01143918, + "auxiliary_loss_mlp": 0.01081238, + "balance_loss_clip": 1.00131106, + "balance_loss_mlp": 0.99998456, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8183845164997978, + "language_loss": 0.56392378, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58617538, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 3.1427013874053955 + }, + { + "auxiliary_loss_clip": 0.01150709, + "auxiliary_loss_mlp": 0.01105137, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00061464, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.711124278350854, + "language_loss": 0.63249087, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65504938, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 2.5137734413146973 + }, + { + "auxiliary_loss_clip": 0.01100458, + "auxiliary_loss_mlp": 0.00747474, + "balance_loss_clip": 1.00170791, + "balance_loss_mlp": 1.00045621, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 1.978119605225486, + "language_loss": 0.64281416, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66129339, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 2.6870574951171875 + }, + { + "auxiliary_loss_clip": 0.0113201, + "auxiliary_loss_mlp": 0.01104723, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00067735, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.4197046544459726, + "language_loss": 0.73053807, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75290537, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.5995700359344482 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01104529, + "balance_loss_clip": 1.00169659, + "balance_loss_mlp": 1.00057864, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.8180637769087447, + "language_loss": 0.80066514, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82286054, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.6324448585510254 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00048494, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 1.9527158174562174, + "language_loss": 0.83831847, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85729611, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.5462756156921387 + }, + { + "auxiliary_loss_clip": 0.01150718, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00056577, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.5510308465179032, + "language_loss": 0.72971749, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75227845, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 2.568941831588745 + }, + { + "auxiliary_loss_clip": 0.01117341, + "auxiliary_loss_mlp": 0.01103967, + "balance_loss_clip": 1.00152266, + "balance_loss_mlp": 1.0003984, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 2.745396404080664, + "language_loss": 0.63363779, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65585083, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 2.6390490531921387 + }, + { + "auxiliary_loss_clip": 0.01165524, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00056529, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 4.236405409321986, + "language_loss": 0.79818898, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82090652, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.5195472240448 + }, + { + "auxiliary_loss_clip": 0.01165299, + "auxiliary_loss_mlp": 0.01104077, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00060368, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.416813921232011, + "language_loss": 0.66462886, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68732268, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 2.5196287631988525 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01104459, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00060344, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 2.114668280434354, + "language_loss": 0.77273464, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79492182, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.6348814964294434 + }, + { + "auxiliary_loss_clip": 0.01165483, + "auxiliary_loss_mlp": 0.01104857, + "balance_loss_clip": 1.00191104, + "balance_loss_mlp": 1.00043011, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.8941183635912024, + "language_loss": 0.69945705, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72216046, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 3.9602513313293457 + }, + { + "auxiliary_loss_clip": 0.0111521, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_clip": 1.00175881, + "balance_loss_mlp": 1.00056303, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.57449323679556, + "language_loss": 0.7402209, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76241434, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.610766887664795 + }, + { + "auxiliary_loss_clip": 0.01117423, + "auxiliary_loss_mlp": 0.01105108, + "balance_loss_clip": 1.0016228, + "balance_loss_mlp": 1.00058544, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 2.1984341162358483, + "language_loss": 0.73755699, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75978225, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 2.5992348194122314 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01104801, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00056481, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.790445088301375, + "language_loss": 0.74031389, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76286805, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.602442741394043 + }, + { + "auxiliary_loss_clip": 0.01165401, + "auxiliary_loss_mlp": 0.01105122, + "balance_loss_clip": 1.00177324, + "balance_loss_mlp": 1.00040913, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.6797630143848938, + "language_loss": 0.73122871, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75393397, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.5253560543060303 + }, + { + "auxiliary_loss_clip": 0.01148672, + "auxiliary_loss_mlp": 0.01104235, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00047576, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.9579109553484615, + "language_loss": 0.84113753, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86366659, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.4988136291503906 + }, + { + "auxiliary_loss_clip": 0.01117416, + "auxiliary_loss_mlp": 0.01105003, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.0005753, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.5810664673534154, + "language_loss": 0.77400529, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79622948, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.6785507202148438 + }, + { + "auxiliary_loss_clip": 0.01148245, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00042057, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.900602199229213, + "language_loss": 0.70589304, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72842205, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.576511859893799 + }, + { + "auxiliary_loss_clip": 0.01165225, + "auxiliary_loss_mlp": 0.0110356, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.0004679, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.643406151189338, + "language_loss": 0.79618752, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81887531, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 2.546537399291992 + }, + { + "auxiliary_loss_clip": 0.01165524, + "auxiliary_loss_mlp": 0.01105274, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00036955, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.8191899162230787, + "language_loss": 0.54914486, + "learning_rate": 5.503754755413424e-07, + "loss": 0.5718528, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.5702576637268066 + }, + { + "auxiliary_loss_clip": 0.01132365, + "auxiliary_loss_mlp": 0.00747461, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.00048709, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.6301707127827139, + "language_loss": 0.77510136, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79389966, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 2.602046012878418 + }, + { + "auxiliary_loss_clip": 0.01148843, + "auxiliary_loss_mlp": 0.01105388, + "balance_loss_clip": 1.0019908, + "balance_loss_mlp": 1.00067413, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.6547335876281883, + "language_loss": 0.68991119, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71245348, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 3.93880295753479 + }, + { + "auxiliary_loss_clip": 0.01165414, + "auxiliary_loss_mlp": 0.01105046, + "balance_loss_clip": 1.0019275, + "balance_loss_mlp": 1.00061893, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 9.523987809489046, + "language_loss": 0.69865811, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72136277, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 3.933666944503784 + }, + { + "auxiliary_loss_clip": 0.01133559, + "auxiliary_loss_mlp": 0.01104893, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00046551, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.4282325959616242, + "language_loss": 0.782296, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80468053, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 2.6471478939056396 + }, + { + "auxiliary_loss_clip": 0.01150487, + "auxiliary_loss_mlp": 0.01104985, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00065303, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7208236137221307, + "language_loss": 0.77633071, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79888546, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 3.9572482109069824 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01105566, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.00047135, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.768676141009334, + "language_loss": 0.73745161, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75984895, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 2.6146819591522217 + }, + { + "auxiliary_loss_clip": 0.01130443, + "auxiliary_loss_mlp": 0.01104889, + "balance_loss_clip": 1.00193298, + "balance_loss_mlp": 1.00055742, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.4329718334828405, + "language_loss": 0.72429633, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74664962, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.01149192, + "auxiliary_loss_mlp": 0.00747619, + "balance_loss_clip": 1.00188792, + "balance_loss_mlp": 1.0005095, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 2.3390020238802185, + "language_loss": 0.77906513, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79803324, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.523998737335205 + }, + { + "auxiliary_loss_clip": 0.01134025, + "auxiliary_loss_mlp": 0.01104425, + "balance_loss_clip": 1.00158751, + "balance_loss_mlp": 1.00057006, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.6697618509491008, + "language_loss": 0.76542503, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78780949, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 2.58158540725708 + }, + { + "auxiliary_loss_clip": 0.01134585, + "auxiliary_loss_mlp": 0.01105574, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00047851, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.8679851102588576, + "language_loss": 0.62241077, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64481235, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.662418842315674 + }, + { + "auxiliary_loss_clip": 0.01165333, + "auxiliary_loss_mlp": 0.01105021, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00059319, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 4.226714539123319, + "language_loss": 0.7908963, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81359982, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.4891014099121094 + }, + { + "auxiliary_loss_clip": 0.01150172, + "auxiliary_loss_mlp": 0.0110533, + "balance_loss_clip": 1.00183284, + "balance_loss_mlp": 1.00061679, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.85102718498351, + "language_loss": 0.65231264, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67486763, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 2.5733911991119385 + }, + { + "auxiliary_loss_clip": 0.01133895, + "auxiliary_loss_mlp": 0.01105085, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00046742, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.440323899238218, + "language_loss": 0.76089561, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78328538, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.5660948753356934 + }, + { + "auxiliary_loss_clip": 0.01131306, + "auxiliary_loss_mlp": 0.01103995, + "balance_loss_clip": 1.00180399, + "balance_loss_mlp": 1.0005219, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.310046625181777, + "language_loss": 0.76688236, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78923535, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.58671498298645 + }, + { + "auxiliary_loss_clip": 0.01133827, + "auxiliary_loss_mlp": 0.01104153, + "balance_loss_clip": 1.00168562, + "balance_loss_mlp": 1.00048852, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 2.304108542624765, + "language_loss": 0.7466948, + "learning_rate": 5.463568918439805e-07, + "loss": 0.76907462, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 2.541349411010742 + }, + { + "auxiliary_loss_clip": 0.01150944, + "auxiliary_loss_mlp": 0.01105065, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00054228, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.04892873482241, + "language_loss": 0.7146076, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73716772, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.5235323905944824 + }, + { + "auxiliary_loss_clip": 0.01150418, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00045085, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.626968883841021, + "language_loss": 0.76494521, + "learning_rate": 5.458220170154896e-07, + "loss": 0.78749627, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.507892370223999 + }, + { + "auxiliary_loss_clip": 0.01113624, + "auxiliary_loss_mlp": 0.01081655, + "balance_loss_clip": 1.00115895, + "balance_loss_mlp": 1.00002003, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6643256842258959, + "language_loss": 0.56796885, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58992171, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 3.2215590476989746 + }, + { + "auxiliary_loss_clip": 0.01165348, + "auxiliary_loss_mlp": 0.01104252, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00068319, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.5549469423125857, + "language_loss": 0.72190821, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74460423, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 2.562368869781494 + }, + { + "auxiliary_loss_clip": 0.01119066, + "auxiliary_loss_mlp": 0.01104503, + "balance_loss_clip": 1.00177789, + "balance_loss_mlp": 1.00045705, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 1.7382257523774929, + "language_loss": 0.69555843, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71779418, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 2.595137596130371 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01104499, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.00045311, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.674362422082184, + "language_loss": 0.73570013, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75823212, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 2.5885205268859863 + }, + { + "auxiliary_loss_clip": 0.01148439, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00055575, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 2.1040302806786495, + "language_loss": 0.75451362, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77703547, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 2.5448477268218994 + }, + { + "auxiliary_loss_clip": 0.01118688, + "auxiliary_loss_mlp": 0.01104681, + "balance_loss_clip": 1.00175381, + "balance_loss_mlp": 1.00063539, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 1.96729004331051, + "language_loss": 0.61439538, + "learning_rate": 5.442187162761537e-07, + "loss": 0.6366291, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 2.6635520458221436 + }, + { + "auxiliary_loss_clip": 0.01148755, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00049877, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 2.053513902722085, + "language_loss": 0.69350278, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71604538, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.564700126647949 + }, + { + "auxiliary_loss_clip": 0.01150745, + "auxiliary_loss_mlp": 0.01104466, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.0005151, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 1.8268729507854413, + "language_loss": 0.61999118, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64254332, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.517096519470215 + }, + { + "auxiliary_loss_clip": 0.01165422, + "auxiliary_loss_mlp": 0.01104375, + "balance_loss_clip": 1.00207996, + "balance_loss_mlp": 1.00051939, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.428572985396831, + "language_loss": 0.79783738, + "learning_rate": 5.434178110152401e-07, + "loss": 0.8205353, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 2.4864449501037598 + }, + { + "auxiliary_loss_clip": 0.01165227, + "auxiliary_loss_mlp": 0.01104772, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00053585, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 2.4702528221455435, + "language_loss": 0.70143014, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72413015, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.5016140937805176 + }, + { + "auxiliary_loss_clip": 0.01148636, + "auxiliary_loss_mlp": 0.01104505, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00064969, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.5228457438099259, + "language_loss": 0.70006508, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72259647, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 2.577162504196167 + }, + { + "auxiliary_loss_clip": 0.01131905, + "auxiliary_loss_mlp": 0.01103938, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.0006547, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.8385885551778247, + "language_loss": 0.76025653, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78261495, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 3.9887611865997314 + }, + { + "auxiliary_loss_clip": 0.01150046, + "auxiliary_loss_mlp": 0.01104394, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.0005393, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.8255449515058162, + "language_loss": 0.75707132, + "learning_rate": 5.423507106536156e-07, + "loss": 0.77961576, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.5659360885620117 + }, + { + "auxiliary_loss_clip": 0.01133875, + "auxiliary_loss_mlp": 0.0110388, + "balance_loss_clip": 1.00164437, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.2248057538433144, + "language_loss": 0.68306088, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70543838, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.7019317150115967 + }, + { + "auxiliary_loss_clip": 0.01133887, + "auxiliary_loss_mlp": 0.01104554, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00050807, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.5911563943391613, + "language_loss": 0.79096961, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81335402, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 2.6221344470977783 + }, + { + "auxiliary_loss_clip": 0.01134045, + "auxiliary_loss_mlp": 0.01103698, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.6493328464065842, + "language_loss": 0.66019511, + "learning_rate": 5.415509657261589e-07, + "loss": 0.6825726, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 2.576143264770508 + }, + { + "auxiliary_loss_clip": 0.01148669, + "auxiliary_loss_mlp": 0.01104906, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00057447, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.5890406094644858, + "language_loss": 0.74017727, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76271307, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 2.5352001190185547 + }, + { + "auxiliary_loss_clip": 0.01131884, + "auxiliary_loss_mlp": 0.01104617, + "balance_loss_clip": 1.00192058, + "balance_loss_mlp": 1.00047565, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.5750058480962879, + "language_loss": 0.70866323, + "learning_rate": 5.410180789470067e-07, + "loss": 0.7310282, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.7012414932250977 + }, + { + "auxiliary_loss_clip": 0.01149817, + "auxiliary_loss_mlp": 0.01103758, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00047493, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.560392623389174, + "language_loss": 0.69329333, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71582901, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.5952553749084473 + }, + { + "auxiliary_loss_clip": 0.01131631, + "auxiliary_loss_mlp": 0.01103227, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00042093, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.7309994485801306, + "language_loss": 0.60728359, + "learning_rate": 5.404854134668162e-07, + "loss": 0.62963217, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.591596841812134 + }, + { + "auxiliary_loss_clip": 0.01114014, + "auxiliary_loss_mlp": 0.01082209, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00019288, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7319495437237717, + "language_loss": 0.60800529, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62996757, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.311368465423584 + }, + { + "auxiliary_loss_clip": 0.01132291, + "auxiliary_loss_mlp": 0.01103852, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.00047338, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.6934638330213199, + "language_loss": 0.69451809, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71687949, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.5912392139434814 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01106, + "balance_loss_clip": 1.0020628, + "balance_loss_mlp": 1.00052321, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 2.177843787279049, + "language_loss": 0.70765591, + "learning_rate": 5.3968683035881e-07, + "loss": 0.73022366, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 4.030990839004517 + }, + { + "auxiliary_loss_clip": 0.01148901, + "auxiliary_loss_mlp": 0.01104816, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00048435, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 1.7851380874163738, + "language_loss": 0.80399448, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82653165, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 3.9146366119384766 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01103804, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00052083, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.584157601013375, + "language_loss": 0.78736174, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80958545, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 2.7173104286193848 + }, + { + "auxiliary_loss_clip": 0.01165402, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00051022, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.2672540142142195, + "language_loss": 0.68302017, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70571303, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 3.945730686187744 + }, + { + "auxiliary_loss_clip": 0.01150447, + "auxiliary_loss_mlp": 0.01104114, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.0004499, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.9565308066744576, + "language_loss": 0.73197979, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75452542, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.571239948272705 + }, + { + "auxiliary_loss_clip": 0.01119144, + "auxiliary_loss_mlp": 0.01103358, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00045633, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.5630338143895537, + "language_loss": 0.80965602, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83188105, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.6697897911071777 + }, + { + "auxiliary_loss_clip": 0.01150535, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00036216, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.6177493612593719, + "language_loss": 0.69662428, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71560323, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 2.5517754554748535 + }, + { + "auxiliary_loss_clip": 0.01096361, + "auxiliary_loss_mlp": 0.01080878, + "balance_loss_clip": 1.00117362, + "balance_loss_mlp": 1.00000596, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.6963017960391026, + "language_loss": 0.56818551, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58995789, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 3.3002686500549316 + }, + { + "auxiliary_loss_clip": 0.0115055, + "auxiliary_loss_mlp": 0.01104998, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.0006659, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.812119695910249, + "language_loss": 0.74017787, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76273328, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.6165363788604736 + }, + { + "auxiliary_loss_clip": 0.01118695, + "auxiliary_loss_mlp": 0.01104152, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.0005827, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.5349691927833353, + "language_loss": 0.70334005, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72556853, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 2.636378765106201 + }, + { + "auxiliary_loss_clip": 0.01148693, + "auxiliary_loss_mlp": 0.0110455, + "balance_loss_clip": 1.00206637, + "balance_loss_mlp": 1.00059938, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.742126589654476, + "language_loss": 0.70232511, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72485757, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.564568281173706 + }, + { + "auxiliary_loss_clip": 0.01134239, + "auxiliary_loss_mlp": 0.01104793, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00046134, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.5240607678800764, + "language_loss": 0.58655214, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60894251, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 2.9792091846466064 + }, + { + "auxiliary_loss_clip": 0.01148689, + "auxiliary_loss_mlp": 0.0110577, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.00057995, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.9638422292041335, + "language_loss": 0.67890471, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70144928, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 2.5666491985321045 + }, + { + "auxiliary_loss_clip": 0.01098808, + "auxiliary_loss_mlp": 0.01104989, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00056148, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.4859902849756026, + "language_loss": 0.79335916, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81539714, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.6832456588745117 + }, + { + "auxiliary_loss_clip": 0.01130388, + "auxiliary_loss_mlp": 0.01104769, + "balance_loss_clip": 1.00196683, + "balance_loss_mlp": 1.00053203, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.9910937879684445, + "language_loss": 0.6678493, + "learning_rate": 5.35966703239153e-07, + "loss": 0.69020087, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 2.7048470973968506 + }, + { + "auxiliary_loss_clip": 0.01133458, + "auxiliary_loss_mlp": 0.01104545, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.00068963, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.811796684037071, + "language_loss": 0.69189143, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71427149, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.578127861022949 + }, + { + "auxiliary_loss_clip": 0.01100596, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_clip": 1.00156355, + "balance_loss_mlp": 1.00038981, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.7789720777779756, + "language_loss": 0.80480242, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82684034, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.661746025085449 + }, + { + "auxiliary_loss_clip": 0.01150456, + "auxiliary_loss_mlp": 0.01105019, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00049639, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.5409357935222199, + "language_loss": 0.77496338, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79751813, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 2.7707910537719727 + }, + { + "auxiliary_loss_clip": 0.01165258, + "auxiliary_loss_mlp": 0.01104086, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00051665, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 1.8871856601491008, + "language_loss": 0.58849645, + "learning_rate": 5.349058071544468e-07, + "loss": 0.6111899, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.5960476398468018 + }, + { + "auxiliary_loss_clip": 0.01133835, + "auxiliary_loss_mlp": 0.01104206, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00044608, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.4718855354678182, + "language_loss": 0.75891709, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78129745, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 2.5827627182006836 + }, + { + "auxiliary_loss_clip": 0.01100053, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00168049, + "balance_loss_mlp": 1.00048757, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 2.2997346234429736, + "language_loss": 0.66733003, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68580472, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.6872975826263428 + }, + { + "auxiliary_loss_clip": 0.01133951, + "auxiliary_loss_mlp": 0.01105166, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00054812, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.9182336527503585, + "language_loss": 0.68841672, + "learning_rate": 5.341107183991553e-07, + "loss": 0.71080786, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 2.69754958152771 + }, + { + "auxiliary_loss_clip": 0.01131697, + "auxiliary_loss_mlp": 0.01104063, + "balance_loss_clip": 1.00182068, + "balance_loss_mlp": 1.00058973, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 2.20329111451366, + "language_loss": 0.68410981, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70646739, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 2.5901079177856445 + }, + { + "auxiliary_loss_clip": 0.01148163, + "auxiliary_loss_mlp": 0.01104067, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00059307, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.6833328002777024, + "language_loss": 0.79666317, + "learning_rate": 5.335809371455526e-07, + "loss": 0.8191855, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.5356149673461914 + }, + { + "auxiliary_loss_clip": 0.01115741, + "auxiliary_loss_mlp": 0.00747386, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00035679, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.9159398614384173, + "language_loss": 0.72775716, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74638844, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.6729063987731934 + }, + { + "auxiliary_loss_clip": 0.01117676, + "auxiliary_loss_mlp": 0.01105666, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00057101, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.8450496456141703, + "language_loss": 0.63691747, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65915084, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 2.7734012603759766 + }, + { + "auxiliary_loss_clip": 0.01136167, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.0006355, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.3537609829243444, + "language_loss": 0.76954412, + "learning_rate": 5.327866823409319e-07, + "loss": 0.7919507, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 2.6284563541412354 + }, + { + "auxiliary_loss_clip": 0.01115382, + "auxiliary_loss_mlp": 0.0110608, + "balance_loss_clip": 1.00177062, + "balance_loss_mlp": 1.00050831, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.54902718425126, + "language_loss": 0.71407092, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73628557, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 4.090196847915649 + }, + { + "auxiliary_loss_clip": 0.01165373, + "auxiliary_loss_mlp": 0.01104862, + "balance_loss_clip": 1.00192785, + "balance_loss_mlp": 1.00043452, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 4.898720319653745, + "language_loss": 0.64330125, + "learning_rate": 5.32257457305499e-07, + "loss": 0.66600358, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.491975784301758 + }, + { + "auxiliary_loss_clip": 0.01117245, + "auxiliary_loss_mlp": 0.01104598, + "balance_loss_clip": 1.00180733, + "balance_loss_mlp": 1.0006479, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 2.197683267730215, + "language_loss": 0.91495979, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93717825, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 2.6442205905914307 + }, + { + "auxiliary_loss_clip": 0.01098144, + "auxiliary_loss_mlp": 0.01104709, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00037742, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.7893131468206418, + "language_loss": 0.82773536, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84976387, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.6249444484710693 + }, + { + "auxiliary_loss_clip": 0.01085915, + "auxiliary_loss_mlp": 0.01105517, + "balance_loss_clip": 1.00174129, + "balance_loss_mlp": 1.00042188, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.482029955174073, + "language_loss": 0.78008264, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80199695, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 2.665984869003296 + }, + { + "auxiliary_loss_clip": 0.01132018, + "auxiliary_loss_mlp": 0.01104843, + "balance_loss_clip": 1.00176442, + "balance_loss_mlp": 1.00051153, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.620051864016486, + "language_loss": 0.83940566, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86177433, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 2.6069371700286865 + }, + { + "auxiliary_loss_clip": 0.01133326, + "auxiliary_loss_mlp": 0.01103658, + "balance_loss_clip": 1.00170696, + "balance_loss_mlp": 1.00056624, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.8333900262658942, + "language_loss": 0.72237647, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74474633, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.5725550651550293 + }, + { + "auxiliary_loss_clip": 0.01135864, + "auxiliary_loss_mlp": 0.01103988, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.00051475, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.6965975697143076, + "language_loss": 0.75663471, + "learning_rate": 5.306711182867747e-07, + "loss": 0.77903324, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.6148555278778076 + }, + { + "auxiliary_loss_clip": 0.01130062, + "auxiliary_loss_mlp": 0.01081638, + "balance_loss_clip": 1.00124657, + "balance_loss_mlp": 1.0000031, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7372421123769004, + "language_loss": 0.55862385, + "learning_rate": 5.304069234017001e-07, + "loss": 0.58074087, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.1916873455047607 + }, + { + "auxiliary_loss_clip": 0.01130027, + "auxiliary_loss_mlp": 0.01081635, + "balance_loss_clip": 1.00131702, + "balance_loss_mlp": 1.00000024, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.752451005253517, + "language_loss": 0.54003489, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56215143, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.2563188076019287 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01104658, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00042164, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 1.7756302080421706, + "language_loss": 0.72820699, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75041914, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 4.00052809715271 + }, + { + "auxiliary_loss_clip": 0.01133417, + "auxiliary_loss_mlp": 0.01104105, + "balance_loss_clip": 1.0018487, + "balance_loss_mlp": 1.00053573, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 5.421207935150475, + "language_loss": 0.74910313, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77147835, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.578251838684082 + }, + { + "auxiliary_loss_clip": 0.01148855, + "auxiliary_loss_mlp": 0.01105162, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00054395, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.098042277829246, + "language_loss": 0.80063975, + "learning_rate": 5.293507012327218e-07, + "loss": 0.8231799, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 3.999558687210083 + }, + { + "auxiliary_loss_clip": 0.01149126, + "auxiliary_loss_mlp": 0.01105944, + "balance_loss_clip": 1.00183558, + "balance_loss_mlp": 1.00056255, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 1.8819763565745817, + "language_loss": 0.79189533, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81444603, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 2.607292890548706 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.0110307, + "balance_loss_clip": 1.00174725, + "balance_loss_mlp": 1.0004549, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.4552797286531587, + "language_loss": 0.70465821, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72686112, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 4.101291656494141 + }, + { + "auxiliary_loss_clip": 0.01133301, + "auxiliary_loss_mlp": 0.01105553, + "balance_loss_clip": 1.00175762, + "balance_loss_mlp": 1.00045848, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.3120988359537034, + "language_loss": 0.78637993, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80876851, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.5428502559661865 + }, + { + "auxiliary_loss_clip": 0.01126209, + "auxiliary_loss_mlp": 0.01081644, + "balance_loss_clip": 1.00133038, + "balance_loss_mlp": 1.00000978, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8081564455663188, + "language_loss": 0.56727809, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58935666, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.2103168964385986 + }, + { + "auxiliary_loss_clip": 0.01113098, + "auxiliary_loss_mlp": 0.01104576, + "balance_loss_clip": 1.0018127, + "balance_loss_mlp": 1.00062525, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.7471120008340564, + "language_loss": 0.72169268, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74386942, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 2.695772886276245 + }, + { + "auxiliary_loss_clip": 0.01148674, + "auxiliary_loss_mlp": 0.01104794, + "balance_loss_clip": 1.00178361, + "balance_loss_mlp": 1.00046253, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.702266718195241, + "language_loss": 0.66526318, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68779778, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.566903591156006 + }, + { + "auxiliary_loss_clip": 0.01133929, + "auxiliary_loss_mlp": 0.01103689, + "balance_loss_clip": 1.0017134, + "balance_loss_mlp": 1.00050128, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.8405365255109347, + "language_loss": 0.65528786, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67766404, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.6358797550201416 + }, + { + "auxiliary_loss_clip": 0.01148833, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00043964, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.303340769555424, + "language_loss": 0.64841521, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67095315, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 2.5670411586761475 + }, + { + "auxiliary_loss_clip": 0.01148713, + "auxiliary_loss_mlp": 0.01103943, + "balance_loss_clip": 1.00188029, + "balance_loss_mlp": 1.00056458, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 5.809365196885492, + "language_loss": 0.71990401, + "learning_rate": 5.26977464707133e-07, + "loss": 0.74243057, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 2.501852035522461 + }, + { + "auxiliary_loss_clip": 0.01099544, + "auxiliary_loss_mlp": 0.01104378, + "balance_loss_clip": 1.00162148, + "balance_loss_mlp": 1.00052261, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.75096051833178, + "language_loss": 0.61464977, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63668907, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.652979850769043 + }, + { + "auxiliary_loss_clip": 0.01148636, + "auxiliary_loss_mlp": 0.01104095, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00052619, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.629394377932049, + "language_loss": 0.67235351, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69488072, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.5469067096710205 + }, + { + "auxiliary_loss_clip": 0.01165441, + "auxiliary_loss_mlp": 0.01105087, + "balance_loss_clip": 1.0019089, + "balance_loss_mlp": 1.00046897, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.6141378374535158, + "language_loss": 0.5764572, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59916246, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.4916789531707764 + }, + { + "auxiliary_loss_clip": 0.01132298, + "auxiliary_loss_mlp": 0.0110436, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00050545, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 2.2070014340296535, + "language_loss": 0.80843902, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83080554, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.6172475814819336 + }, + { + "auxiliary_loss_clip": 0.01165484, + "auxiliary_loss_mlp": 0.01104698, + "balance_loss_clip": 1.00199962, + "balance_loss_mlp": 1.00065231, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.3075670930593737, + "language_loss": 0.68576312, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70846486, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 2.461644172668457 + }, + { + "auxiliary_loss_clip": 0.01133969, + "auxiliary_loss_mlp": 0.01104527, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00067198, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.8324190672922833, + "language_loss": 0.72217172, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74455667, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.6459202766418457 + }, + { + "auxiliary_loss_clip": 0.01148688, + "auxiliary_loss_mlp": 0.01105902, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00061595, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 5.9518355368383, + "language_loss": 0.7661714, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78871727, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.524068832397461 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01104931, + "balance_loss_clip": 1.00166214, + "balance_loss_mlp": 1.00040865, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 2.49976376814727, + "language_loss": 0.72437066, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74676156, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 2.562610149383545 + }, + { + "auxiliary_loss_clip": 0.01165309, + "auxiliary_loss_mlp": 0.01103289, + "balance_loss_clip": 1.00201142, + "balance_loss_mlp": 1.0007695, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.36092096145591, + "language_loss": 0.73669744, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75938338, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.486443519592285 + }, + { + "auxiliary_loss_clip": 0.01165278, + "auxiliary_loss_mlp": 0.01104533, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00058246, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.6012157231025885, + "language_loss": 0.81350553, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83620363, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 2.6035215854644775 + }, + { + "auxiliary_loss_clip": 0.01160995, + "auxiliary_loss_mlp": 0.01081249, + "balance_loss_clip": 1.00130606, + "balance_loss_mlp": 0.99999601, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8539589280984401, + "language_loss": 0.55158389, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57400632, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 3.247511625289917 + }, + { + "auxiliary_loss_clip": 0.01116919, + "auxiliary_loss_mlp": 0.01103716, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.0005281, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.6547202952991202, + "language_loss": 0.69454437, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71675068, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 2.5841915607452393 + }, + { + "auxiliary_loss_clip": 0.01130297, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00060248, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 2.3250060690145546, + "language_loss": 0.80221039, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82457125, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 2.545257329940796 + }, + { + "auxiliary_loss_clip": 0.01148634, + "auxiliary_loss_mlp": 0.01104747, + "balance_loss_clip": 1.00174034, + "balance_loss_mlp": 1.00060558, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.6537657635489709, + "language_loss": 0.78248137, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80501521, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 2.575698137283325 + }, + { + "auxiliary_loss_clip": 0.01133936, + "auxiliary_loss_mlp": 0.01104389, + "balance_loss_clip": 1.00164235, + "balance_loss_mlp": 1.00043893, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.3580374465992877, + "language_loss": 0.60943079, + "learning_rate": 5.230321283779071e-07, + "loss": 0.631814, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 2.628873348236084 + }, + { + "auxiliary_loss_clip": 0.01132189, + "auxiliary_loss_mlp": 0.01105264, + "balance_loss_clip": 1.00173128, + "balance_loss_mlp": 1.000646, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.494705281249112, + "language_loss": 0.7954458, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81782031, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 2.6075387001037598 + }, + { + "auxiliary_loss_clip": 0.01095455, + "auxiliary_loss_mlp": 0.01080987, + "balance_loss_clip": 1.00160384, + "balance_loss_mlp": 1.00011528, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8404940962723726, + "language_loss": 0.55368859, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57545298, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 4.581002712249756 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01104505, + "balance_loss_clip": 1.00172138, + "balance_loss_mlp": 1.0005542, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.0156674150101295, + "language_loss": 0.72750282, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74959004, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.6691832542419434 + }, + { + "auxiliary_loss_clip": 0.01116604, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00044584, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.6307241098030145, + "language_loss": 0.69840991, + "learning_rate": 5.219821655586814e-07, + "loss": 0.71704936, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 2.6218037605285645 + }, + { + "auxiliary_loss_clip": 0.01133326, + "auxiliary_loss_mlp": 0.01104408, + "balance_loss_clip": 1.00181997, + "balance_loss_mlp": 1.00055254, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.6493782425107468, + "language_loss": 0.59817779, + "learning_rate": 5.217198149454575e-07, + "loss": 0.62055504, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.688098669052124 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.0108213, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00011408, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8569883052333435, + "language_loss": 0.55830276, + "learning_rate": 5.214575203887666e-07, + "loss": 0.58057606, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 3.082361936569214 + }, + { + "auxiliary_loss_clip": 0.01150141, + "auxiliary_loss_mlp": 0.01104333, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00057304, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.3954402348237678, + "language_loss": 0.69182503, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71436977, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.513029098510742 + }, + { + "auxiliary_loss_clip": 0.01148629, + "auxiliary_loss_mlp": 0.0110415, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00048578, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 2.177304274361527, + "language_loss": 0.79709589, + "learning_rate": 5.209330994847647e-07, + "loss": 0.81962365, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.5559751987457275 + }, + { + "auxiliary_loss_clip": 0.01148773, + "auxiliary_loss_mlp": 0.00747408, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00039601, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.7807592246988655, + "language_loss": 0.79781032, + "learning_rate": 5.206709731573402e-07, + "loss": 0.8167721, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 2.547398567199707 + }, + { + "auxiliary_loss_clip": 0.01116662, + "auxiliary_loss_mlp": 0.01104638, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00049686, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.3578408634271888, + "language_loss": 0.76239306, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78460604, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.658764600753784 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.00747538, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.0004046, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 5.851451897292925, + "language_loss": 0.68850458, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70697033, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.691426992416382 + }, + { + "auxiliary_loss_clip": 0.0113386, + "auxiliary_loss_mlp": 0.0110501, + "balance_loss_clip": 1.00166309, + "balance_loss_mlp": 1.00048709, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 1.9043417444990505, + "language_loss": 0.73668778, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75907648, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 3.9885542392730713 + }, + { + "auxiliary_loss_clip": 0.01150362, + "auxiliary_loss_mlp": 0.01104672, + "balance_loss_clip": 1.00186431, + "balance_loss_mlp": 1.00062656, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.4234157963609437, + "language_loss": 0.71482623, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73737663, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.582181453704834 + }, + { + "auxiliary_loss_clip": 0.0116524, + "auxiliary_loss_mlp": 0.01104322, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00056195, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 2.01857756855314, + "language_loss": 0.64533877, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66803443, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 4.04983925819397 + }, + { + "auxiliary_loss_clip": 0.01146483, + "auxiliary_loss_mlp": 0.00746545, + "balance_loss_clip": 1.00135267, + "balance_loss_mlp": 1.00112271, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.8089834845316739, + "language_loss": 0.61677033, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63570058, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 3.0874745845794678 + }, + { + "auxiliary_loss_clip": 0.01165386, + "auxiliary_loss_mlp": 0.01103958, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 1.00048423, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.6613231170152614, + "language_loss": 0.7901181, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81281149, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 4.004836797714233 + }, + { + "auxiliary_loss_clip": 0.01116694, + "auxiliary_loss_mlp": 0.011055, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00050008, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.9788604744460312, + "language_loss": 0.72700644, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74922836, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 2.6071887016296387 + }, + { + "auxiliary_loss_clip": 0.01165272, + "auxiliary_loss_mlp": 0.01104641, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00040412, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 2.404740489393942, + "language_loss": 0.78453755, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80723667, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 2.467991590499878 + }, + { + "auxiliary_loss_clip": 0.01085027, + "auxiliary_loss_mlp": 0.00747331, + "balance_loss_clip": 1.00144196, + "balance_loss_mlp": 1.00029445, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.4652911474727375, + "language_loss": 0.7957375, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81406105, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 2.8431107997894287 + }, + { + "auxiliary_loss_clip": 0.01150746, + "auxiliary_loss_mlp": 0.01104773, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00044084, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.655415437608125, + "language_loss": 0.73939264, + "learning_rate": 5.177912880970474e-07, + "loss": 0.76194787, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.8331291675567627 + }, + { + "auxiliary_loss_clip": 0.01165207, + "auxiliary_loss_mlp": 0.01103915, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00063205, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.6112767343387213, + "language_loss": 0.82297122, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84566242, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.52036714553833 + }, + { + "auxiliary_loss_clip": 0.01161005, + "auxiliary_loss_mlp": 0.01081221, + "balance_loss_clip": 1.00136304, + "balance_loss_mlp": 0.99996752, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.8044019327493451, + "language_loss": 0.54473567, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56715792, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.1643195152282715 + }, + { + "auxiliary_loss_clip": 0.01148654, + "auxiliary_loss_mlp": 0.01105316, + "balance_loss_clip": 1.00190938, + "balance_loss_mlp": 1.00041175, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.475081202606873, + "language_loss": 0.7137863, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73632598, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 2.6616406440734863 + }, + { + "auxiliary_loss_clip": 0.01165331, + "auxiliary_loss_mlp": 0.01104334, + "balance_loss_clip": 1.00197613, + "balance_loss_mlp": 1.0004791, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.718716035229166, + "language_loss": 0.67821574, + "learning_rate": 5.167458153638254e-07, + "loss": 0.70091236, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 2.4836249351501465 + }, + { + "auxiliary_loss_clip": 0.01119367, + "auxiliary_loss_mlp": 0.01104297, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00053728, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.6723290808929485, + "language_loss": 0.78759789, + "learning_rate": 5.164845877686162e-07, + "loss": 0.8098346, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 2.617176055908203 + }, + { + "auxiliary_loss_clip": 0.0108556, + "auxiliary_loss_mlp": 0.00747318, + "balance_loss_clip": 1.001531, + "balance_loss_mlp": 1.00034297, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 1.7571866707196906, + "language_loss": 0.78356552, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80189431, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.6915199756622314 + }, + { + "auxiliary_loss_clip": 0.01165481, + "auxiliary_loss_mlp": 0.01104566, + "balance_loss_clip": 1.00190771, + "balance_loss_mlp": 1.00042498, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.039689267254843, + "language_loss": 0.77082062, + "learning_rate": 5.159623013532591e-07, + "loss": 0.79352105, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.498427629470825 + }, + { + "auxiliary_loss_clip": 0.01149042, + "auxiliary_loss_mlp": 0.01103792, + "balance_loss_clip": 1.00203001, + "balance_loss_mlp": 1.00060439, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.4303879138650197, + "language_loss": 0.6751408, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69766915, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.543104410171509 + }, + { + "auxiliary_loss_clip": 0.01165484, + "auxiliary_loss_mlp": 0.01105487, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00058305, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.48301821295493, + "language_loss": 0.74443108, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76714075, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 2.4535460472106934 + }, + { + "auxiliary_loss_clip": 0.01148739, + "auxiliary_loss_mlp": 0.01104512, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00046635, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.6485918173544534, + "language_loss": 0.74627697, + "learning_rate": 5.15179293816405e-07, + "loss": 0.7688095, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 2.5797016620635986 + }, + { + "auxiliary_loss_clip": 0.0110229, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_clip": 1.00168836, + "balance_loss_mlp": 1.00047481, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.6375897314918397, + "language_loss": 0.83122003, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85328329, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.6958584785461426 + }, + { + "auxiliary_loss_clip": 0.01165357, + "auxiliary_loss_mlp": 0.01104421, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00047076, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.7765760804132609, + "language_loss": 0.73152399, + "learning_rate": 5.146575702980898e-07, + "loss": 0.7542218, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 2.4967455863952637 + }, + { + "auxiliary_loss_clip": 0.01135133, + "auxiliary_loss_mlp": 0.01104384, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00043392, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.6789037068848152, + "language_loss": 0.82370239, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84609759, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 2.624284029006958 + }, + { + "auxiliary_loss_clip": 0.01165536, + "auxiliary_loss_mlp": 0.01105996, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.00051939, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.0547233363519446, + "language_loss": 0.71819746, + "learning_rate": 5.141360720771077e-07, + "loss": 0.74091274, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.4924991130828857 + }, + { + "auxiliary_loss_clip": 0.0110018, + "auxiliary_loss_mlp": 0.00747474, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00041938, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 3.393366157606962, + "language_loss": 0.65220582, + "learning_rate": 5.138754074778371e-07, + "loss": 0.67068231, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.6375648975372314 + }, + { + "auxiliary_loss_clip": 0.01148609, + "auxiliary_loss_mlp": 0.01103968, + "balance_loss_clip": 1.00178635, + "balance_loss_mlp": 1.00058985, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.4233730740148898, + "language_loss": 0.70950109, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73202682, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.5481326580047607 + }, + { + "auxiliary_loss_clip": 0.01149929, + "auxiliary_loss_mlp": 0.01104642, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.0005964, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.0370516611782015, + "language_loss": 0.7800588, + "learning_rate": 5.133542473511578e-07, + "loss": 0.8026045, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 2.5225186347961426 + }, + { + "auxiliary_loss_clip": 0.01148852, + "auxiliary_loss_mlp": 0.01103853, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00037932, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.5093199999771254, + "language_loss": 0.73821312, + "learning_rate": 5.130937518435124e-07, + "loss": 0.76074016, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 2.6266613006591797 + }, + { + "auxiliary_loss_clip": 0.01150215, + "auxiliary_loss_mlp": 0.01104192, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.0005281, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 1.7993457186071773, + "language_loss": 0.75935769, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78190184, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 2.509647846221924 + }, + { + "auxiliary_loss_clip": 0.01133692, + "auxiliary_loss_mlp": 0.01103425, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00052381, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.8895017752772094, + "language_loss": 0.68729514, + "learning_rate": 5.12572929988999e-07, + "loss": 0.70966631, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 3.9859018325805664 + }, + { + "auxiliary_loss_clip": 0.01165279, + "auxiliary_loss_mlp": 0.01104548, + "balance_loss_clip": 1.00179362, + "balance_loss_mlp": 1.00050259, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.0586280506958694, + "language_loss": 0.84900177, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87170005, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 2.483884572982788 + }, + { + "auxiliary_loss_clip": 0.01165362, + "auxiliary_loss_mlp": 0.01105109, + "balance_loss_clip": 1.00192976, + "balance_loss_mlp": 1.0004909, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.344175326391296, + "language_loss": 0.66062558, + "learning_rate": 5.120523337480174e-07, + "loss": 0.6833303, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.591531753540039 + }, + { + "auxiliary_loss_clip": 0.0110156, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00038886, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.6179903583243165, + "language_loss": 0.62392354, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64598066, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.6849517822265625 + }, + { + "auxiliary_loss_clip": 0.01150625, + "auxiliary_loss_mlp": 0.01104751, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 1.00051451, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 2.095286211122553, + "language_loss": 0.65158093, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67413473, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 2.5555005073547363 + }, + { + "auxiliary_loss_clip": 0.01134077, + "auxiliary_loss_mlp": 0.01103284, + "balance_loss_clip": 1.00161529, + "balance_loss_mlp": 1.00057292, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.1474090995037085, + "language_loss": 0.70960212, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73197579, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 2.5639870166778564 + }, + { + "auxiliary_loss_clip": 0.0111901, + "auxiliary_loss_mlp": 0.01105529, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00052905, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 2.441321662295609, + "language_loss": 0.82839781, + "learning_rate": 5.110118184224736e-07, + "loss": 0.85064316, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.624025821685791 + }, + { + "auxiliary_loss_clip": 0.01131758, + "auxiliary_loss_mlp": 0.01104951, + "balance_loss_clip": 1.00175464, + "balance_loss_mlp": 1.00052333, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 2.199747744710151, + "language_loss": 0.73352289, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75589001, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 2.545423984527588 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01102929, + "balance_loss_clip": 1.0017302, + "balance_loss_mlp": 1.00050426, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 3.211808780654364, + "language_loss": 0.79938674, + "learning_rate": 5.104918994957364e-07, + "loss": 0.8217485, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 2.62551212310791 + }, + { + "auxiliary_loss_clip": 0.01131813, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.0006156, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.7271755233108543, + "language_loss": 0.70013392, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72248441, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.592924118041992 + }, + { + "auxiliary_loss_clip": 0.01134112, + "auxiliary_loss_mlp": 0.01105713, + "balance_loss_clip": 1.00181472, + "balance_loss_mlp": 1.00061846, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 1.8398059127074957, + "language_loss": 0.84350276, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86590099, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 4.054787635803223 + }, + { + "auxiliary_loss_clip": 0.0111193, + "auxiliary_loss_mlp": 0.01081745, + "balance_loss_clip": 1.0017283, + "balance_loss_mlp": 1.00011086, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7700286036235264, + "language_loss": 0.60437143, + "learning_rate": 5.097124447474858e-07, + "loss": 0.6263082, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 4.610869646072388 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.01104717, + "balance_loss_clip": 1.00170219, + "balance_loss_mlp": 1.00048029, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.851975045196009, + "language_loss": 0.71893889, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74099219, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.630009651184082 + }, + { + "auxiliary_loss_clip": 0.01148672, + "auxiliary_loss_mlp": 0.01103987, + "balance_loss_clip": 1.00195539, + "balance_loss_mlp": 1.00070453, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.6148006108726716, + "language_loss": 0.80956751, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83209413, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 3.928581953048706 + }, + { + "auxiliary_loss_clip": 0.01165251, + "auxiliary_loss_mlp": 0.01103383, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00057721, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.662336671706659, + "language_loss": 0.64317572, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66586208, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.5379040241241455 + }, + { + "auxiliary_loss_clip": 0.01114843, + "auxiliary_loss_mlp": 0.01103414, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.0005126, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 2.1177444336109605, + "language_loss": 0.69571948, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71790206, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 2.580209255218506 + }, + { + "auxiliary_loss_clip": 0.01149944, + "auxiliary_loss_mlp": 0.01103014, + "balance_loss_clip": 1.00170183, + "balance_loss_mlp": 1.00039816, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.6718956242334848, + "language_loss": 0.70859593, + "learning_rate": 5.084144838687275e-07, + "loss": 0.73112547, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.5440735816955566 + }, + { + "auxiliary_loss_clip": 0.01148513, + "auxiliary_loss_mlp": 0.01104526, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00048053, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.7292936074534528, + "language_loss": 0.81556946, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83809984, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 2.5442841053009033 + }, + { + "auxiliary_loss_clip": 0.01117183, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00051475, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 2.022382305140143, + "language_loss": 0.79436219, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81657863, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.5969090461730957 + }, + { + "auxiliary_loss_clip": 0.01118806, + "auxiliary_loss_mlp": 0.01103371, + "balance_loss_clip": 1.00182247, + "balance_loss_mlp": 1.00037444, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 2.05953553903396, + "language_loss": 0.66117465, + "learning_rate": 5.076363859955932e-07, + "loss": 0.6833964, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.6174416542053223 + }, + { + "auxiliary_loss_clip": 0.0115049, + "auxiliary_loss_mlp": 0.0110423, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00046992, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.398194162608616, + "language_loss": 0.78795189, + "learning_rate": 5.073771332059257e-07, + "loss": 0.81049907, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.5958974361419678 + }, + { + "auxiliary_loss_clip": 0.011488, + "auxiliary_loss_mlp": 0.0110499, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00046694, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 1.7831165338708501, + "language_loss": 0.67019057, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69272852, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.5189032554626465 + }, + { + "auxiliary_loss_clip": 0.01144955, + "auxiliary_loss_mlp": 0.01081284, + "balance_loss_clip": 1.00132883, + "balance_loss_mlp": 1.00003076, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.810353324642645, + "language_loss": 0.58484328, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60710561, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 3.179875612258911 + }, + { + "auxiliary_loss_clip": 0.01133237, + "auxiliary_loss_mlp": 0.01104362, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.791510748877962, + "language_loss": 0.78229231, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80466831, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 2.579845428466797 + }, + { + "auxiliary_loss_clip": 0.01117146, + "auxiliary_loss_mlp": 0.01103802, + "balance_loss_clip": 1.00159252, + "balance_loss_mlp": 1.00061452, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.6815975579240638, + "language_loss": 0.67968321, + "learning_rate": 5.063406881496209e-07, + "loss": 0.70189267, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.6079490184783936 + }, + { + "auxiliary_loss_clip": 0.01131497, + "auxiliary_loss_mlp": 0.01104865, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00062799, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.6910927637520818, + "language_loss": 0.68972564, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71208924, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.5762343406677246 + }, + { + "auxiliary_loss_clip": 0.01165352, + "auxiliary_loss_mlp": 0.01105003, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00057554, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 2.148581658853483, + "language_loss": 0.7506572, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77336079, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.522301435470581 + }, + { + "auxiliary_loss_clip": 0.011486, + "auxiliary_loss_mlp": 0.0074755, + "balance_loss_clip": 1.00184059, + "balance_loss_mlp": 1.00043178, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.0473273673253027, + "language_loss": 0.70252502, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72148645, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 2.518786668777466 + }, + { + "auxiliary_loss_clip": 0.01118697, + "auxiliary_loss_mlp": 0.01105262, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.0005486, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 2.360627327658318, + "language_loss": 0.75030828, + "learning_rate": 5.053051493286453e-07, + "loss": 0.77254784, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.599609851837158 + }, + { + "auxiliary_loss_clip": 0.01148812, + "auxiliary_loss_mlp": 0.01104143, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00066996, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.564665290749763, + "language_loss": 0.77644742, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79897702, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.593902111053467 + }, + { + "auxiliary_loss_clip": 0.01148728, + "auxiliary_loss_mlp": 0.0110495, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00061798, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.4505857690987745, + "language_loss": 0.76931775, + "learning_rate": 5.047877199527666e-07, + "loss": 0.7918545, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.59552001953125 + }, + { + "auxiliary_loss_clip": 0.01150135, + "auxiliary_loss_mlp": 0.01104364, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.00041342, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.5863648281561138, + "language_loss": 0.73092133, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75346631, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 2.5671658515930176 + }, + { + "auxiliary_loss_clip": 0.01131986, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00034714, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 3.006916994648592, + "language_loss": 0.75812763, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78049618, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.587148904800415 + }, + { + "auxiliary_loss_clip": 0.01165206, + "auxiliary_loss_mlp": 0.01103834, + "balance_loss_clip": 1.00195026, + "balance_loss_mlp": 1.00045538, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.229021325111796, + "language_loss": 0.68031961, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70301002, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 2.534498691558838 + }, + { + "auxiliary_loss_clip": 0.01148554, + "auxiliary_loss_mlp": 0.00747374, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00039268, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 2.2769330003228947, + "language_loss": 0.67407441, + "learning_rate": 5.037535416626459e-07, + "loss": 0.6930337, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.6069445610046387 + }, + { + "auxiliary_loss_clip": 0.01119308, + "auxiliary_loss_mlp": 0.01104987, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00065506, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 1.9376259219134164, + "language_loss": 0.81339192, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83563489, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.5970826148986816 + }, + { + "auxiliary_loss_clip": 0.0115049, + "auxiliary_loss_mlp": 0.0110335, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00054359, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.114355879307793, + "language_loss": 0.6740725, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69661093, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 2.5102105140686035 + }, + { + "auxiliary_loss_clip": 0.01117284, + "auxiliary_loss_mlp": 0.0110416, + "balance_loss_clip": 1.00171304, + "balance_loss_mlp": 1.00068593, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.6034186806846416, + "language_loss": 0.70427734, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72649181, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 2.621206760406494 + }, + { + "auxiliary_loss_clip": 0.01148083, + "auxiliary_loss_mlp": 0.01104014, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00063586, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 2.010815276302285, + "language_loss": 0.67800683, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70052779, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 3.9997453689575195 + }, + { + "auxiliary_loss_clip": 0.01100743, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_clip": 1.00174832, + "balance_loss_mlp": 1.00076246, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 2.6030122232179616, + "language_loss": 0.71927661, + "learning_rate": 5.024620954742646e-07, + "loss": 0.74133503, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 2.7504541873931885 + }, + { + "auxiliary_loss_clip": 0.01165406, + "auxiliary_loss_mlp": 0.00747417, + "balance_loss_clip": 1.00199628, + "balance_loss_mlp": 1.00047517, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.2321265501230094, + "language_loss": 0.62760127, + "learning_rate": 5.022039765577836e-07, + "loss": 0.64672947, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.5160586833953857 + }, + { + "auxiliary_loss_clip": 0.01124003, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_clip": 1.00134408, + "balance_loss_mlp": 0.99997371, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.770762665904521, + "language_loss": 0.53249574, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55454803, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.2537918090820312 + }, + { + "auxiliary_loss_clip": 0.01131944, + "auxiliary_loss_mlp": 0.01104421, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00056601, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 1.5528965084825812, + "language_loss": 0.62174004, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64410371, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.6418557167053223 + }, + { + "auxiliary_loss_clip": 0.01133917, + "auxiliary_loss_mlp": 0.01104898, + "balance_loss_clip": 1.00177217, + "balance_loss_mlp": 1.00056624, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.7102984345326229, + "language_loss": 0.82121378, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84360194, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.6179635524749756 + }, + { + "auxiliary_loss_clip": 0.01148902, + "auxiliary_loss_mlp": 0.01105026, + "balance_loss_clip": 1.00164831, + "balance_loss_mlp": 1.0005033, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.8199799786940476, + "language_loss": 0.74862206, + "learning_rate": 5.011720689554603e-07, + "loss": 0.77116132, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.579329490661621 + }, + { + "auxiliary_loss_clip": 0.01086673, + "auxiliary_loss_mlp": 0.01104578, + "balance_loss_clip": 1.00164247, + "balance_loss_mlp": 1.00053239, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.4424586588330965, + "language_loss": 0.65587521, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67778772, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 2.9872231483459473 + }, + { + "auxiliary_loss_clip": 0.01149977, + "auxiliary_loss_mlp": 0.0110492, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.00049233, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.460517281497641, + "language_loss": 0.64446217, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66701114, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 2.6075661182403564 + }, + { + "auxiliary_loss_clip": 0.01165369, + "auxiliary_loss_mlp": 0.01103988, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00070548, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.7108514746517764, + "language_loss": 0.73408705, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75678062, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 2.494241237640381 + }, + { + "auxiliary_loss_clip": 0.01100345, + "auxiliary_loss_mlp": 0.01105505, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00050521, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 1.928287524497393, + "language_loss": 0.78868943, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81074792, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 4.0547239780426025 + }, + { + "auxiliary_loss_clip": 0.01148731, + "auxiliary_loss_mlp": 0.01104358, + "balance_loss_clip": 1.00197852, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.7358263135590601, + "language_loss": 0.70835888, + "learning_rate": 4.998834633291829e-07, + "loss": 0.7308898, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 4.017602205276489 + }, + { + "auxiliary_loss_clip": 0.01149176, + "auxiliary_loss_mlp": 0.01105249, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00053513, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.6521533896947689, + "language_loss": 0.75884545, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78138971, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.551044225692749 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01104934, + "balance_loss_clip": 1.00181973, + "balance_loss_mlp": 1.00060236, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.6603793116339656, + "language_loss": 0.80249459, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82457137, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 4.022570371627808 + }, + { + "auxiliary_loss_clip": 0.01115571, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_clip": 1.00174367, + "balance_loss_mlp": 1.00057876, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 1.848986236700854, + "language_loss": 0.92163491, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94383788, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 2.6458420753479004 + }, + { + "auxiliary_loss_clip": 0.01148516, + "auxiliary_loss_mlp": 0.01104003, + "balance_loss_clip": 1.00167048, + "balance_loss_mlp": 1.00043416, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 2.3283179993756535, + "language_loss": 0.66130942, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68383467, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 2.6068179607391357 + }, + { + "auxiliary_loss_clip": 0.01115324, + "auxiliary_loss_mlp": 0.01104454, + "balance_loss_clip": 1.0017308, + "balance_loss_mlp": 1.00059867, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.837194634385418, + "language_loss": 0.7218349, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74403274, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 2.6450107097625732 + }, + { + "auxiliary_loss_clip": 0.01149034, + "auxiliary_loss_mlp": 0.01105105, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00039172, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.6285576502245573, + "language_loss": 0.65807414, + "learning_rate": 4.983390138757027e-07, + "loss": 0.68061554, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 2.579899311065674 + }, + { + "auxiliary_loss_clip": 0.01131937, + "auxiliary_loss_mlp": 0.01105778, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.0005877, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.7320399509281441, + "language_loss": 0.72449476, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74687189, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.646777391433716 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01104268, + "balance_loss_clip": 1.00150478, + "balance_loss_mlp": 1.00050807, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.5828531853029384, + "language_loss": 0.74304783, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76510859, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 2.6761422157287598 + }, + { + "auxiliary_loss_clip": 0.01115103, + "auxiliary_loss_mlp": 0.01105523, + "balance_loss_clip": 1.00176859, + "balance_loss_mlp": 1.00052357, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 2.2569214757244582, + "language_loss": 0.77668822, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79889446, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.6316616535186768 + }, + { + "auxiliary_loss_clip": 0.01165373, + "auxiliary_loss_mlp": 0.01104742, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00050509, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.7329928851775545, + "language_loss": 0.79430771, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81700885, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.4936177730560303 + }, + { + "auxiliary_loss_clip": 0.01112671, + "auxiliary_loss_mlp": 0.01082063, + "balance_loss_clip": 1.00270367, + "balance_loss_mlp": 1.00004697, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.840689998340116, + "language_loss": 0.59802896, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61997628, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.1623618602752686 + }, + { + "auxiliary_loss_clip": 0.01148668, + "auxiliary_loss_mlp": 0.01104341, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00048554, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.3700859711373472, + "language_loss": 0.76160979, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78413987, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.6250579357147217 + }, + { + "auxiliary_loss_clip": 0.01131724, + "auxiliary_loss_mlp": 0.01105169, + "balance_loss_clip": 1.00193131, + "balance_loss_mlp": 1.000646, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 3.9055798487812523, + "language_loss": 0.73347437, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75584328, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 2.6065897941589355 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01104851, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00051939, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.8456097415194557, + "language_loss": 0.70270216, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72477323, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 2.6585893630981445 + }, + { + "auxiliary_loss_clip": 0.01133956, + "auxiliary_loss_mlp": 0.00747454, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00043356, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.5740381541694926, + "language_loss": 0.8394978, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85831189, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.591247320175171 + }, + { + "auxiliary_loss_clip": 0.01150688, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00037837, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 2.345167814739477, + "language_loss": 0.68194139, + "learning_rate": 4.957694879434397e-07, + "loss": 0.70449054, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.521296739578247 + }, + { + "auxiliary_loss_clip": 0.01165335, + "auxiliary_loss_mlp": 0.01104908, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.0005765, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.4487185702588854, + "language_loss": 0.87522888, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89793134, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 2.511115550994873 + }, + { + "auxiliary_loss_clip": 0.01148645, + "auxiliary_loss_mlp": 0.01104908, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.0005765, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 1.9388082332196637, + "language_loss": 0.85105813, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87359369, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.5420384407043457 + }, + { + "auxiliary_loss_clip": 0.01165208, + "auxiliary_loss_mlp": 0.00747336, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00039089, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.6810380266119052, + "language_loss": 0.68823242, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70735788, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.508075714111328 + }, + { + "auxiliary_loss_clip": 0.01118063, + "auxiliary_loss_mlp": 0.01104127, + "balance_loss_clip": 1.00161409, + "balance_loss_mlp": 1.00046229, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.4976097073574768, + "language_loss": 0.776645, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79886687, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 2.6486449241638184 + }, + { + "auxiliary_loss_clip": 0.01150663, + "auxiliary_loss_mlp": 0.01105455, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00055122, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.101360148819122, + "language_loss": 0.729321, + "learning_rate": 4.944868633899462e-07, + "loss": 0.7518822, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 2.6123595237731934 + }, + { + "auxiliary_loss_clip": 0.01100405, + "auxiliary_loss_mlp": 0.01103715, + "balance_loss_clip": 1.00167418, + "balance_loss_mlp": 1.00052762, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 2.5878040342838147, + "language_loss": 0.67823946, + "learning_rate": 4.942305097079751e-07, + "loss": 0.70028067, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 2.668992757797241 + }, + { + "auxiliary_loss_clip": 0.01126154, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_clip": 1.00132918, + "balance_loss_mlp": 0.99998951, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7747074231980715, + "language_loss": 0.58570898, + "learning_rate": 4.939742131249347e-07, + "loss": 0.6077829, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.295229434967041 + }, + { + "auxiliary_loss_clip": 0.01165478, + "auxiliary_loss_mlp": 0.0110554, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.0006355, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 1.8164715622831804, + "language_loss": 0.6737324, + "learning_rate": 4.937179736505428e-07, + "loss": 0.6964426, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.4944798946380615 + }, + { + "auxiliary_loss_clip": 0.01150745, + "auxiliary_loss_mlp": 0.01104961, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00053334, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.8234911116514145, + "language_loss": 0.69297266, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71552974, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 2.5949370861053467 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01104931, + "balance_loss_clip": 1.00199151, + "balance_loss_mlp": 1.0005033, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.7015714133551987, + "language_loss": 0.65241241, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67511582, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 4.0540452003479 + }, + { + "auxiliary_loss_clip": 0.01087303, + "auxiliary_loss_mlp": 0.01104288, + "balance_loss_clip": 1.00166249, + "balance_loss_mlp": 1.00052822, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.18194453825605, + "language_loss": 0.65394002, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67585599, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 2.6922149658203125 + }, + { + "auxiliary_loss_clip": 0.0116534, + "auxiliary_loss_mlp": 0.0110455, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00059903, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.7424340500501057, + "language_loss": 0.75579166, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77849054, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 2.4752020835876465 + }, + { + "auxiliary_loss_clip": 0.0116546, + "auxiliary_loss_mlp": 0.01105962, + "balance_loss_clip": 1.00201595, + "balance_loss_mlp": 1.00058103, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 2.2675482453974016, + "language_loss": 0.68925226, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71196645, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.5030553340911865 + }, + { + "auxiliary_loss_clip": 0.01148929, + "auxiliary_loss_mlp": 0.01105444, + "balance_loss_clip": 1.00184977, + "balance_loss_mlp": 1.00044489, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.658639243551934, + "language_loss": 0.72270787, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74525166, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 2.584998846054077 + }, + { + "auxiliary_loss_clip": 0.01133978, + "auxiliary_loss_mlp": 0.0110397, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00059164, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.727705586896823, + "language_loss": 0.65573561, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67811507, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 2.6234381198883057 + }, + { + "auxiliary_loss_clip": 0.01131713, + "auxiliary_loss_mlp": 0.01102838, + "balance_loss_clip": 1.00171816, + "balance_loss_mlp": 1.0005089, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.5175883674724482, + "language_loss": 0.81273872, + "learning_rate": 4.916701149323022e-07, + "loss": 0.8350842, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.5972490310668945 + }, + { + "auxiliary_loss_clip": 0.01165534, + "auxiliary_loss_mlp": 0.01104258, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00059342, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 1.9500857239330145, + "language_loss": 0.76431763, + "learning_rate": 4.91414389872737e-07, + "loss": 0.78701556, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.4781124591827393 + }, + { + "auxiliary_loss_clip": 0.01150676, + "auxiliary_loss_mlp": 0.01104152, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00048733, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.5936966423300962, + "language_loss": 0.72705448, + "learning_rate": 4.911587220188905e-07, + "loss": 0.74960274, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.528407573699951 + }, + { + "auxiliary_loss_clip": 0.01134452, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00070167, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.3773421699421458, + "language_loss": 0.68844903, + "learning_rate": 4.909031113804551e-07, + "loss": 0.71084577, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.571143865585327 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.011041, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.0005312, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.575183882499039, + "language_loss": 0.76173222, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78393924, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.656304121017456 + }, + { + "auxiliary_loss_clip": 0.01068616, + "auxiliary_loss_mlp": 0.01103988, + "balance_loss_clip": 1.00166535, + "balance_loss_mlp": 1.00051427, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.6402297420014078, + "language_loss": 0.77076823, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79249424, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 5.584977626800537 + }, + { + "auxiliary_loss_clip": 0.01150376, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00067294, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.9542017778667886, + "language_loss": 0.71687627, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73943293, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 2.5144202709198 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.00747371, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00035787, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.5626419413639996, + "language_loss": 0.77986836, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79880649, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 2.5658068656921387 + }, + { + "auxiliary_loss_clip": 0.01150679, + "auxiliary_loss_mlp": 0.01105713, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00071359, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.7513275656619385, + "language_loss": 0.75450909, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77707309, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 4.006548166275024 + }, + { + "auxiliary_loss_clip": 0.01133976, + "auxiliary_loss_mlp": 0.01103816, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00062799, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.8556726221181328, + "language_loss": 0.73457325, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75695121, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.6330652236938477 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01103693, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00050497, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.6586401766182017, + "language_loss": 0.698421, + "learning_rate": 4.891154397568795e-07, + "loss": 0.72095364, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.543064832687378 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.00747259, + "balance_loss_clip": 1.00188303, + "balance_loss_mlp": 1.0003736, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.6006927627897076, + "language_loss": 0.63847494, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65743256, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 2.603489637374878 + }, + { + "auxiliary_loss_clip": 0.01132862, + "auxiliary_loss_mlp": 0.01103828, + "balance_loss_clip": 1.00173998, + "balance_loss_mlp": 1.00054467, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.67612022817946, + "language_loss": 0.76939058, + "learning_rate": 4.88605191926694e-07, + "loss": 0.79175752, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 2.622863531112671 + }, + { + "auxiliary_loss_clip": 0.01150165, + "auxiliary_loss_mlp": 0.01103216, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00050521, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.4391961469843535, + "language_loss": 0.72674477, + "learning_rate": 4.883501539751289e-07, + "loss": 0.7492786, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 2.5846219062805176 + }, + { + "auxiliary_loss_clip": 0.01133103, + "auxiliary_loss_mlp": 0.00747282, + "balance_loss_clip": 1.00203907, + "balance_loss_mlp": 1.00044155, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 3.566652659913095, + "language_loss": 0.74345338, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76225722, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.6022729873657227 + }, + { + "auxiliary_loss_clip": 0.01165323, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00058651, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.659678823129007, + "language_loss": 0.71669781, + "learning_rate": 4.878402500474073e-07, + "loss": 0.73939353, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 2.5176925659179688 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01104281, + "balance_loss_clip": 1.00165391, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1.8084061082265255, + "language_loss": 0.60872579, + "learning_rate": 4.875853840905874e-07, + "loss": 0.63110775, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.5712099075317383 + }, + { + "auxiliary_loss_clip": 0.01148683, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00057483, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.7175542370772046, + "language_loss": 0.70058364, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72309947, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 2.5437090396881104 + }, + { + "auxiliary_loss_clip": 0.01113326, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00040579, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.5867156982653945, + "language_loss": 0.72222984, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74083722, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 2.757205009460449 + }, + { + "auxiliary_loss_clip": 0.01117623, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00065637, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.735991186771018, + "language_loss": 0.74114799, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76337516, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 2.6336820125579834 + }, + { + "auxiliary_loss_clip": 0.01165447, + "auxiliary_loss_mlp": 0.01105015, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00049245, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 2.1931110917807186, + "language_loss": 0.71400279, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73670739, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 2.5098557472229004 + }, + { + "auxiliary_loss_clip": 0.01148769, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00050282, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 1.6978757403708882, + "language_loss": 0.77338678, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79591423, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 2.5482122898101807 + }, + { + "auxiliary_loss_clip": 0.01116706, + "auxiliary_loss_mlp": 0.01103806, + "balance_loss_clip": 1.00177729, + "balance_loss_mlp": 1.00052273, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.6112590757198242, + "language_loss": 0.69095159, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71315664, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.612185001373291 + }, + { + "auxiliary_loss_clip": 0.01116642, + "auxiliary_loss_mlp": 0.01103501, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00040865, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 1.7812590251190512, + "language_loss": 0.81968713, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84188861, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.6118576526641846 + }, + { + "auxiliary_loss_clip": 0.01133411, + "auxiliary_loss_mlp": 0.0074741, + "balance_loss_clip": 1.00176656, + "balance_loss_mlp": 1.00034761, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.3614634629574531, + "language_loss": 0.65972543, + "learning_rate": 4.85548521880289e-07, + "loss": 0.67853367, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 2.6691789627075195 + }, + { + "auxiliary_loss_clip": 0.0113294, + "auxiliary_loss_mlp": 0.01104098, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00043344, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 1.4531216149359572, + "language_loss": 0.74653709, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76890755, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.663072109222412 + }, + { + "auxiliary_loss_clip": 0.01133733, + "auxiliary_loss_mlp": 0.01105651, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00046074, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 1.67502491718485, + "language_loss": 0.62132597, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64371979, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.6167633533477783 + }, + { + "auxiliary_loss_clip": 0.01165261, + "auxiliary_loss_mlp": 0.01104298, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00044334, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 2.005240771644851, + "language_loss": 0.76673877, + "learning_rate": 4.847856458505217e-07, + "loss": 0.78943431, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 2.5519840717315674 + }, + { + "auxiliary_loss_clip": 0.0116538, + "auxiliary_loss_mlp": 0.01105333, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00071442, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 2.170119053658433, + "language_loss": 0.77811867, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80082583, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 2.5310161113739014 + }, + { + "auxiliary_loss_clip": 0.01115073, + "auxiliary_loss_mlp": 0.01103565, + "balance_loss_clip": 1.00204504, + "balance_loss_mlp": 1.00056851, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.6920800014743496, + "language_loss": 0.73059154, + "learning_rate": 4.842773491000067e-07, + "loss": 0.75277793, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 2.6387104988098145 + }, + { + "auxiliary_loss_clip": 0.01134219, + "auxiliary_loss_mlp": 0.01104702, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00046587, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.4597404444527453, + "language_loss": 0.73123848, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75362766, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 2.6116580963134766 + }, + { + "auxiliary_loss_clip": 0.01132119, + "auxiliary_loss_mlp": 0.01105091, + "balance_loss_clip": 1.00168014, + "balance_loss_mlp": 1.00047338, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 2.0309876485505174, + "language_loss": 0.74620599, + "learning_rate": 4.837692822549086e-07, + "loss": 0.76857805, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 2.556856155395508 + }, + { + "auxiliary_loss_clip": 0.01135967, + "auxiliary_loss_mlp": 0.01104386, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00062585, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 2.7234067523879446, + "language_loss": 0.8121174, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83452094, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 4.03104567527771 + }, + { + "auxiliary_loss_clip": 0.01133415, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.000561, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.5550201608761371, + "language_loss": 0.77022457, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79259902, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.595621347427368 + }, + { + "auxiliary_loss_clip": 0.01150259, + "auxiliary_loss_mlp": 0.01104988, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.000561, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.6690792905670058, + "language_loss": 0.74454856, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76710105, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 2.6415393352508545 + }, + { + "auxiliary_loss_clip": 0.0114424, + "auxiliary_loss_mlp": 0.01081253, + "balance_loss_clip": 1.00130248, + "balance_loss_mlp": 0.99999952, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7294754285547937, + "language_loss": 0.55074191, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57299685, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 3.1433944702148438 + }, + { + "auxiliary_loss_clip": 0.01131398, + "auxiliary_loss_mlp": 0.01104118, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00054932, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.7775051016545067, + "language_loss": 0.8150419, + "learning_rate": 4.82500121484009e-07, + "loss": 0.8373971, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.55147123336792 + }, + { + "auxiliary_loss_clip": 0.01116547, + "auxiliary_loss_mlp": 0.01103235, + "balance_loss_clip": 1.00169659, + "balance_loss_mlp": 1.00042903, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.4965861608953797, + "language_loss": 0.70427161, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72646946, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.6556782722473145 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.0110394, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.0003705, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 1.9581851383893256, + "language_loss": 0.77572918, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79810703, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 2.5755465030670166 + }, + { + "auxiliary_loss_clip": 0.01117858, + "auxiliary_loss_mlp": 0.01104956, + "balance_loss_clip": 1.00154722, + "balance_loss_mlp": 1.00062454, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.7931646392646123, + "language_loss": 0.65854758, + "learning_rate": 4.817393154694398e-07, + "loss": 0.6807757, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.8427834510803223 + }, + { + "auxiliary_loss_clip": 0.01165515, + "auxiliary_loss_mlp": 0.01104419, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00046825, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.7414474628410836, + "language_loss": 0.6177398, + "learning_rate": 4.814858285969578e-07, + "loss": 0.64043915, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.01133838, + "auxiliary_loss_mlp": 0.01104094, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00042915, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.8381947260825529, + "language_loss": 0.68303835, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70541763, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.6096861362457275 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01103986, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00051188, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.8202336039978697, + "language_loss": 0.69018984, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71288347, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.508225917816162 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01103363, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00036645, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.7620009794007614, + "language_loss": 0.75044316, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77260244, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 5.518909454345703 + }, + { + "auxiliary_loss_clip": 0.01165549, + "auxiliary_loss_mlp": 0.01105867, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00058126, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.273597877404602, + "language_loss": 0.68325651, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70597064, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.4822609424591064 + }, + { + "auxiliary_loss_clip": 0.01165401, + "auxiliary_loss_mlp": 0.01105201, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00048804, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.607918315635349, + "language_loss": 0.8264842, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84919024, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 3.948981285095215 + }, + { + "auxiliary_loss_clip": 0.01136145, + "auxiliary_loss_mlp": 0.01105148, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00053048, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 1.844532028123606, + "language_loss": 0.74690539, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76931834, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 2.5692970752716064 + }, + { + "auxiliary_loss_clip": 0.01150525, + "auxiliary_loss_mlp": 0.01104975, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00054777, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.4984990893125467, + "language_loss": 0.8459692, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86852425, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 2.5526773929595947 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01105062, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00044394, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 2.080902356928894, + "language_loss": 0.66091681, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68345809, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.5439510345458984 + }, + { + "auxiliary_loss_clip": 0.01119436, + "auxiliary_loss_mlp": 0.01104743, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00060153, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.5470359384075143, + "language_loss": 0.66650522, + "learning_rate": 4.792070390968027e-07, + "loss": 0.68874705, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.01150349, + "auxiliary_loss_mlp": 0.0110644, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.00067782, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 3.658899911513434, + "language_loss": 0.73097807, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75354588, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.5598325729370117 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01104458, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.00060272, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.5916139411488506, + "language_loss": 0.62134928, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64390171, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.538835287094116 + }, + { + "auxiliary_loss_clip": 0.01165118, + "auxiliary_loss_mlp": 0.01103052, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00053263, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 1.8276960983727624, + "language_loss": 0.82694459, + "learning_rate": 4.784484802864403e-07, + "loss": 0.8496263, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 2.568453550338745 + }, + { + "auxiliary_loss_clip": 0.01117256, + "auxiliary_loss_mlp": 0.00747461, + "balance_loss_clip": 1.00173199, + "balance_loss_mlp": 1.00042844, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.8540439771139732, + "language_loss": 0.72599983, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74464703, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 2.6529245376586914 + }, + { + "auxiliary_loss_clip": 0.01148649, + "auxiliary_loss_mlp": 0.00747564, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00043011, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.5162412440672173, + "language_loss": 0.72115088, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74011302, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 2.564077138900757 + }, + { + "auxiliary_loss_clip": 0.011653, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_clip": 1.00184333, + "balance_loss_mlp": 1.00051033, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.7179767812105815, + "language_loss": 0.69051206, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71321344, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.5096170902252197 + }, + { + "auxiliary_loss_clip": 0.01134288, + "auxiliary_loss_mlp": 0.01105357, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00035787, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.6358383364381635, + "language_loss": 0.69742674, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71982324, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 2.6384341716766357 + }, + { + "auxiliary_loss_clip": 0.01117189, + "auxiliary_loss_mlp": 0.01103849, + "balance_loss_clip": 1.00165451, + "balance_loss_mlp": 1.00047064, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 1.920996301383926, + "language_loss": 0.81310868, + "learning_rate": 4.771853696779586e-07, + "loss": 0.8353191, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.6297616958618164 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00049376, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.5083874613402473, + "language_loss": 0.61957717, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64212, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.6319704055786133 + }, + { + "auxiliary_loss_clip": 0.01148578, + "auxiliary_loss_mlp": 0.01103661, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00037813, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.764269598159339, + "language_loss": 0.70043224, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72295463, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 2.5687716007232666 + }, + { + "auxiliary_loss_clip": 0.01160835, + "auxiliary_loss_mlp": 0.01080832, + "balance_loss_clip": 1.001261, + "balance_loss_mlp": 0.99996012, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7049352732096663, + "language_loss": 0.55032194, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57273853, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 3.181295871734619 + }, + { + "auxiliary_loss_clip": 0.0113436, + "auxiliary_loss_mlp": 0.01106524, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.0006659, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 2.1164285918583734, + "language_loss": 0.65482485, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67723364, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.5928189754486084 + }, + { + "auxiliary_loss_clip": 0.01127871, + "auxiliary_loss_mlp": 0.0108085, + "balance_loss_clip": 1.0015229, + "balance_loss_mlp": 0.99997884, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7272774825145042, + "language_loss": 0.5846324, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60671961, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.2294745445251465 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01104425, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00047398, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.8655868746444402, + "language_loss": 0.7413246, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76368046, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 2.57597017288208 + }, + { + "auxiliary_loss_clip": 0.01165376, + "auxiliary_loss_mlp": 0.01105654, + "balance_loss_clip": 1.00195229, + "balance_loss_mlp": 1.00055909, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.6999991995568626, + "language_loss": 0.74748445, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77019477, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.4977943897247314 + }, + { + "auxiliary_loss_clip": 0.0113134, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_clip": 1.00178397, + "balance_loss_mlp": 1.00059319, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 1.9420611399257348, + "language_loss": 0.75531197, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77767938, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 2.5519280433654785 + }, + { + "auxiliary_loss_clip": 0.0116533, + "auxiliary_loss_mlp": 0.01104454, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00050354, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.428197896540536, + "language_loss": 0.77477753, + "learning_rate": 4.749154093390708e-07, + "loss": 0.7974754, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 2.5101335048675537 + }, + { + "auxiliary_loss_clip": 0.01100307, + "auxiliary_loss_mlp": 0.01104028, + "balance_loss_clip": 1.00159943, + "balance_loss_mlp": 1.00036335, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.4420893742786418, + "language_loss": 0.67504108, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69708443, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 2.722505807876587 + }, + { + "auxiliary_loss_clip": 0.01148598, + "auxiliary_loss_mlp": 0.01105225, + "balance_loss_clip": 1.00197291, + "balance_loss_mlp": 1.00060678, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.192067790970488, + "language_loss": 0.62885553, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.65139377, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.5485405921936035 + }, + { + "auxiliary_loss_clip": 0.0116523, + "auxiliary_loss_mlp": 0.01104058, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00067925, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.6296634118203706, + "language_loss": 0.69343537, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71612829, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 2.5388023853302 + }, + { + "auxiliary_loss_clip": 0.01092458, + "auxiliary_loss_mlp": 0.01080506, + "balance_loss_clip": 1.00155354, + "balance_loss_mlp": 1.00001609, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6423782958182772, + "language_loss": 0.56188595, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58361566, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 4.931185245513916 + }, + { + "auxiliary_loss_clip": 0.01135549, + "auxiliary_loss_mlp": 0.01103918, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.00044382, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.6155336570146066, + "language_loss": 0.6720562, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69445086, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.7511563301086426 + }, + { + "auxiliary_loss_clip": 0.01165438, + "auxiliary_loss_mlp": 0.01105778, + "balance_loss_clip": 1.0019753, + "balance_loss_mlp": 1.00049257, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.6852531007340545, + "language_loss": 0.77898961, + "learning_rate": 4.734047044272498e-07, + "loss": 0.80170178, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 2.5966238975524902 + }, + { + "auxiliary_loss_clip": 0.01133881, + "auxiliary_loss_mlp": 0.01103735, + "balance_loss_clip": 1.00185859, + "balance_loss_mlp": 1.00054753, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.6616730935028965, + "language_loss": 0.78434741, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80672354, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 2.6199307441711426 + }, + { + "auxiliary_loss_clip": 0.011486, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.0004251, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 1.9750963110397268, + "language_loss": 0.75479591, + "learning_rate": 4.729015991306715e-07, + "loss": 0.7773152, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.53161358833313 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01104153, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00058424, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.6602249807809815, + "language_loss": 0.70513499, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72767782, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 2.5485291481018066 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.0110482, + "balance_loss_clip": 1.00179064, + "balance_loss_mlp": 1.0006789, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.5876290685882366, + "language_loss": 0.68431753, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.70638555, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.634397506713867 + }, + { + "auxiliary_loss_clip": 0.01118986, + "auxiliary_loss_mlp": 0.01106445, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00058758, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.7586115151649828, + "language_loss": 0.81030214, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83255649, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.6930882930755615 + }, + { + "auxiliary_loss_clip": 0.01150591, + "auxiliary_loss_mlp": 0.01105305, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.693648870958669, + "language_loss": 0.70740986, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72996879, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.624016523361206 + }, + { + "auxiliary_loss_clip": 0.01116968, + "auxiliary_loss_mlp": 0.0110429, + "balance_loss_clip": 1.00180852, + "balance_loss_mlp": 1.00043488, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 1.7418377565566956, + "language_loss": 0.78675616, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80896878, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 2.586754560470581 + }, + { + "auxiliary_loss_clip": 0.01150579, + "auxiliary_loss_mlp": 0.01105604, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.0006994, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 1.8341102597903791, + "language_loss": 0.63025558, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65281743, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 3.9193711280822754 + }, + { + "auxiliary_loss_clip": 0.0114851, + "auxiliary_loss_mlp": 0.01104514, + "balance_loss_clip": 1.00181425, + "balance_loss_mlp": 1.00056314, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.4684522121181314, + "language_loss": 0.71769929, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74022949, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 3.9249625205993652 + }, + { + "auxiliary_loss_clip": 0.01165444, + "auxiliary_loss_mlp": 0.00747502, + "balance_loss_clip": 1.00196946, + "balance_loss_mlp": 1.00036454, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.7985117115023312, + "language_loss": 0.72037548, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.73950493, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 2.502774238586426 + }, + { + "auxiliary_loss_clip": 0.01165417, + "auxiliary_loss_mlp": 0.01105375, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00066137, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.7711302137126932, + "language_loss": 0.66048539, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68319327, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 3.971980333328247 + }, + { + "auxiliary_loss_clip": 0.01148801, + "auxiliary_loss_mlp": 0.01105772, + "balance_loss_clip": 1.00195515, + "balance_loss_mlp": 1.0005815, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.5250214549455987, + "language_loss": 0.73088264, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75342834, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.558448076248169 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01104424, + "balance_loss_clip": 1.00174356, + "balance_loss_mlp": 1.00066435, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.1142971854543555, + "language_loss": 0.59653401, + "learning_rate": 4.701386624460717e-07, + "loss": 0.61876714, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 2.612250566482544 + }, + { + "auxiliary_loss_clip": 0.0113393, + "auxiliary_loss_mlp": 0.0110461, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00056374, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.5015584359744079, + "language_loss": 0.68192524, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70431066, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 2.692159414291382 + }, + { + "auxiliary_loss_clip": 0.01115118, + "auxiliary_loss_mlp": 0.01103444, + "balance_loss_clip": 1.00163126, + "balance_loss_mlp": 1.00044727, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.7538630935336321, + "language_loss": 0.69392014, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71610576, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 2.6848390102386475 + }, + { + "auxiliary_loss_clip": 0.01101789, + "auxiliary_loss_mlp": 0.0110414, + "balance_loss_clip": 1.00160837, + "balance_loss_mlp": 1.00057101, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.4124719186184007, + "language_loss": 0.67440355, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69646275, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.682993173599243 + }, + { + "auxiliary_loss_clip": 0.01144899, + "auxiliary_loss_mlp": 0.00746453, + "balance_loss_clip": 1.00134647, + "balance_loss_mlp": 1.00109196, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.657512182257361, + "language_loss": 0.57396865, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59288216, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.1727001667022705 + }, + { + "auxiliary_loss_clip": 0.01134047, + "auxiliary_loss_mlp": 0.01104512, + "balance_loss_clip": 1.00202119, + "balance_loss_mlp": 1.000561, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.704523815642885, + "language_loss": 0.8411746, + "learning_rate": 4.688851018730369e-07, + "loss": 0.8635602, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.6224522590637207 + }, + { + "auxiliary_loss_clip": 0.01148416, + "auxiliary_loss_mlp": 0.0110426, + "balance_loss_clip": 1.00189507, + "balance_loss_mlp": 1.00049996, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.371874393229044, + "language_loss": 0.88573432, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90826112, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 2.612051486968994 + }, + { + "auxiliary_loss_clip": 0.01133532, + "auxiliary_loss_mlp": 0.01105483, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00057888, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.6048779162000595, + "language_loss": 0.78923106, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81162119, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.597036600112915 + }, + { + "auxiliary_loss_clip": 0.01131895, + "auxiliary_loss_mlp": 0.01104644, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00040722, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.364014370066581, + "language_loss": 0.72172296, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74408829, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 2.6135199069976807 + }, + { + "auxiliary_loss_clip": 0.01103892, + "auxiliary_loss_mlp": 0.01104334, + "balance_loss_clip": 1.00185239, + "balance_loss_mlp": 1.00057459, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.8401218612770152, + "language_loss": 0.63355428, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65563655, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 2.7416067123413086 + }, + { + "auxiliary_loss_clip": 0.01150006, + "auxiliary_loss_mlp": 0.01104519, + "balance_loss_clip": 1.00183427, + "balance_loss_mlp": 1.00056803, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.87930975793778, + "language_loss": 0.7328065, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75535172, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.5702292919158936 + }, + { + "auxiliary_loss_clip": 0.01132371, + "auxiliary_loss_mlp": 0.01104935, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00050795, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.756506684819552, + "language_loss": 0.7492395, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.77161258, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 2.635220766067505 + }, + { + "auxiliary_loss_clip": 0.0116545, + "auxiliary_loss_mlp": 0.01105885, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00050426, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.6958415228429549, + "language_loss": 0.72845668, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.75117004, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.5080835819244385 + }, + { + "auxiliary_loss_clip": 0.01150768, + "auxiliary_loss_mlp": 0.01103534, + "balance_loss_clip": 1.00187016, + "balance_loss_mlp": 1.00053692, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.056221126950194, + "language_loss": 0.73485327, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75739622, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 2.5640742778778076 + }, + { + "auxiliary_loss_clip": 0.0116539, + "auxiliary_loss_mlp": 0.01105142, + "balance_loss_clip": 1.00198066, + "balance_loss_mlp": 1.00071454, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 1.78102262705957, + "language_loss": 0.72436905, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74707437, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.6671667098999023 + }, + { + "auxiliary_loss_clip": 0.01131725, + "auxiliary_loss_mlp": 0.01103387, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00048542, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 2.0488809780969826, + "language_loss": 0.69444352, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71679467, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 2.5370657444000244 + }, + { + "auxiliary_loss_clip": 0.01146629, + "auxiliary_loss_mlp": 0.01103275, + "balance_loss_clip": 1.00203085, + "balance_loss_mlp": 1.00046921, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.525918145128131, + "language_loss": 0.70658821, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72908723, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 2.5505852699279785 + }, + { + "auxiliary_loss_clip": 0.01148691, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.0006249, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.9110223211743502, + "language_loss": 0.76117921, + "learning_rate": 4.658824808801938e-07, + "loss": 0.78371179, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.5803864002227783 + }, + { + "auxiliary_loss_clip": 0.01165431, + "auxiliary_loss_mlp": 0.01105388, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00038862, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.6004330751834808, + "language_loss": 0.75040495, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77311319, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.4954628944396973 + }, + { + "auxiliary_loss_clip": 0.01081677, + "auxiliary_loss_mlp": 0.01103843, + "balance_loss_clip": 1.00159931, + "balance_loss_mlp": 1.00046492, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.7107478538808467, + "language_loss": 0.70211691, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72397208, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 2.7239434719085693 + }, + { + "auxiliary_loss_clip": 0.01113737, + "auxiliary_loss_mlp": 0.01104819, + "balance_loss_clip": 1.00170898, + "balance_loss_mlp": 1.00048721, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 2.2091662638644625, + "language_loss": 0.76567674, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.7878623, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.6020545959472656 + }, + { + "auxiliary_loss_clip": 0.01150539, + "auxiliary_loss_mlp": 0.0110465, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00050879, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.5278236436126786, + "language_loss": 0.70966887, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.73222071, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 2.549518346786499 + }, + { + "auxiliary_loss_clip": 0.01117473, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_clip": 1.00166416, + "balance_loss_mlp": 1.00061834, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.9538296800418946, + "language_loss": 0.76658762, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78881091, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 4.021990537643433 + }, + { + "auxiliary_loss_clip": 0.01129794, + "auxiliary_loss_mlp": 0.01104086, + "balance_loss_clip": 1.00195205, + "balance_loss_mlp": 1.00051689, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 7.202413349199291, + "language_loss": 0.76973605, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79207486, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.5734846591949463 + }, + { + "auxiliary_loss_clip": 0.01116852, + "auxiliary_loss_mlp": 0.01103972, + "balance_loss_clip": 1.00149775, + "balance_loss_mlp": 1.00040293, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 1.90553279739511, + "language_loss": 0.74567968, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76788795, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 2.6459274291992188 + }, + { + "auxiliary_loss_clip": 0.011505, + "auxiliary_loss_mlp": 0.01104069, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00040507, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.414363948006013, + "language_loss": 0.68538821, + "learning_rate": 4.638853864505297e-07, + "loss": 0.7079339, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 2.5701799392700195 + }, + { + "auxiliary_loss_clip": 0.01150697, + "auxiliary_loss_mlp": 0.01104004, + "balance_loss_clip": 1.00209939, + "balance_loss_mlp": 1.00043559, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 2.746410647556095, + "language_loss": 0.72988486, + "learning_rate": 4.636360116707625e-07, + "loss": 0.75243187, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.624729871749878 + }, + { + "auxiliary_loss_clip": 0.01118721, + "auxiliary_loss_mlp": 0.0110395, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00047612, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 2.0206505070719794, + "language_loss": 0.68032813, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70255482, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 2.6114675998687744 + }, + { + "auxiliary_loss_clip": 0.01148969, + "auxiliary_loss_mlp": 0.01104425, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00066519, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.9942266297011166, + "language_loss": 0.76305354, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78558749, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 2.5546631813049316 + }, + { + "auxiliary_loss_clip": 0.01160909, + "auxiliary_loss_mlp": 0.01080858, + "balance_loss_clip": 1.00133848, + "balance_loss_mlp": 0.99998683, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 1.5180133170234111, + "language_loss": 0.53419638, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55661404, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.161734104156494 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01104144, + "balance_loss_clip": 1.00161684, + "balance_loss_mlp": 1.00038469, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.8302601596446118, + "language_loss": 0.67627811, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69831026, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 2.6770172119140625 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.01103797, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00041842, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.7823638305104188, + "language_loss": 0.67831838, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70051831, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.6630847454071045 + }, + { + "auxiliary_loss_clip": 0.0114986, + "auxiliary_loss_mlp": 0.01104762, + "balance_loss_clip": 1.00198066, + "balance_loss_mlp": 1.00052571, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.532457724441328, + "language_loss": 0.7726174, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79516357, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.6203181743621826 + }, + { + "auxiliary_loss_clip": 0.01102081, + "auxiliary_loss_mlp": 0.01103851, + "balance_loss_clip": 1.00166357, + "balance_loss_mlp": 1.00047231, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.593540881544993, + "language_loss": 0.65885091, + "learning_rate": 4.618920199958083e-07, + "loss": 0.68091023, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 4.038980722427368 + }, + { + "auxiliary_loss_clip": 0.0110195, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_clip": 1.00160742, + "balance_loss_mlp": 1.00045037, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.5611102571490445, + "language_loss": 0.74004054, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76210308, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 4.137150764465332 + }, + { + "auxiliary_loss_clip": 0.01148961, + "auxiliary_loss_mlp": 0.01104968, + "balance_loss_clip": 1.00201809, + "balance_loss_mlp": 1.00044525, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 1.63107793256895, + "language_loss": 0.71255028, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73508954, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 2.5491085052490234 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01104364, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00060439, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.5691000004621583, + "language_loss": 0.76669335, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78905809, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 4.060185194015503 + }, + { + "auxiliary_loss_clip": 0.01114936, + "auxiliary_loss_mlp": 0.01103604, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00060713, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.6713257817119898, + "language_loss": 0.75118089, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77336627, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.6404261589050293 + }, + { + "auxiliary_loss_clip": 0.01116607, + "auxiliary_loss_mlp": 0.01103903, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00033402, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.5010094514946017, + "language_loss": 0.68672574, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.70893091, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.6515843868255615 + }, + { + "auxiliary_loss_clip": 0.01150539, + "auxiliary_loss_mlp": 0.01105024, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00050163, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 1.955885126692385, + "language_loss": 0.80010062, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82265627, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 2.4994187355041504 + }, + { + "auxiliary_loss_clip": 0.01148605, + "auxiliary_loss_mlp": 0.01104843, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00051057, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.7188737962574703, + "language_loss": 0.70845258, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73098707, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 2.636714458465576 + }, + { + "auxiliary_loss_clip": 0.01150494, + "auxiliary_loss_mlp": 0.01103838, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00065064, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.488710629006542, + "language_loss": 0.81359512, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83613837, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 2.584397554397583 + }, + { + "auxiliary_loss_clip": 0.01120076, + "auxiliary_loss_mlp": 0.01103961, + "balance_loss_clip": 1.00184667, + "balance_loss_mlp": 1.00048757, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.4475856961549614, + "language_loss": 0.68307483, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70531517, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.7220852375030518 + }, + { + "auxiliary_loss_clip": 0.0114849, + "auxiliary_loss_mlp": 0.01104773, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00053608, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.495177458120062, + "language_loss": 0.69589758, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71843016, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.5528223514556885 + }, + { + "auxiliary_loss_clip": 0.01131919, + "auxiliary_loss_mlp": 0.01104508, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00065315, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.4336795690902957, + "language_loss": 0.68383414, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70619845, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.6015055179595947 + }, + { + "auxiliary_loss_clip": 0.01134175, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00049102, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.7543617469369328, + "language_loss": 0.66437179, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68675792, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 2.620500326156616 + }, + { + "auxiliary_loss_clip": 0.01131883, + "auxiliary_loss_mlp": 0.01105151, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.168436066880153, + "language_loss": 0.74503016, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76740062, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 2.5607962608337402 + }, + { + "auxiliary_loss_clip": 0.01133491, + "auxiliary_loss_mlp": 0.01103948, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00057006, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 2.305063409438954, + "language_loss": 0.70324492, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72561932, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.5443758964538574 + }, + { + "auxiliary_loss_clip": 0.01133714, + "auxiliary_loss_mlp": 0.01105267, + "balance_loss_clip": 1.00178683, + "balance_loss_mlp": 1.00045872, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.5888901761840875, + "language_loss": 0.72244167, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74483156, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 2.5804553031921387 + }, + { + "auxiliary_loss_clip": 0.01165229, + "auxiliary_loss_mlp": 0.01104433, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00048232, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.6823267353499336, + "language_loss": 0.74700636, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76970297, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.5146918296813965 + }, + { + "auxiliary_loss_clip": 0.0113331, + "auxiliary_loss_mlp": 0.01102735, + "balance_loss_clip": 1.00172889, + "balance_loss_mlp": 1.00059617, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.5566511932700797, + "language_loss": 0.71482962, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73719007, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.6249618530273438 + }, + { + "auxiliary_loss_clip": 0.01160848, + "auxiliary_loss_mlp": 0.01080828, + "balance_loss_clip": 1.00127673, + "balance_loss_mlp": 0.99995679, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6700457875983009, + "language_loss": 0.55519629, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57761306, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 3.169093132019043 + }, + { + "auxiliary_loss_clip": 0.01143644, + "auxiliary_loss_mlp": 0.01080503, + "balance_loss_clip": 1.001158, + "balance_loss_mlp": 1.00001287, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.720394461624193, + "language_loss": 0.49971858, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52196002, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 3.1978375911712646 + }, + { + "auxiliary_loss_clip": 0.01149636, + "auxiliary_loss_mlp": 0.01103625, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.0005331, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.6671404337936775, + "language_loss": 0.83885193, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.86138451, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.597395896911621 + }, + { + "auxiliary_loss_clip": 0.01144365, + "auxiliary_loss_mlp": 0.01080451, + "balance_loss_clip": 1.0013119, + "balance_loss_mlp": 0.99996132, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7080805897670603, + "language_loss": 0.63942397, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66167223, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 3.121828079223633 + }, + { + "auxiliary_loss_clip": 0.01134028, + "auxiliary_loss_mlp": 0.01104201, + "balance_loss_clip": 1.00167584, + "balance_loss_mlp": 1.00053668, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.2693963570015874, + "language_loss": 0.793477, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81585926, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.5668044090270996 + }, + { + "auxiliary_loss_clip": 0.01132189, + "auxiliary_loss_mlp": 0.01103553, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00046086, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 1.7534895316183283, + "language_loss": 0.75726998, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77962738, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 2.5853991508483887 + }, + { + "auxiliary_loss_clip": 0.01104252, + "auxiliary_loss_mlp": 0.0110412, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00045574, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.765245140764797, + "language_loss": 0.7940461, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.8161298, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.6986629962921143 + }, + { + "auxiliary_loss_clip": 0.01148128, + "auxiliary_loss_mlp": 0.01105121, + "balance_loss_clip": 1.00175762, + "balance_loss_mlp": 1.00050306, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.0404572192226937, + "language_loss": 0.67898202, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70151448, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.6160454750061035 + }, + { + "auxiliary_loss_clip": 0.01133995, + "auxiliary_loss_mlp": 0.01102973, + "balance_loss_clip": 1.0017606, + "balance_loss_mlp": 1.00054884, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.4668921711291636, + "language_loss": 0.70438647, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72675622, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 4.224232196807861 + }, + { + "auxiliary_loss_clip": 0.01134022, + "auxiliary_loss_mlp": 0.01105651, + "balance_loss_clip": 1.00195992, + "balance_loss_mlp": 1.00065124, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.5811774156061824, + "language_loss": 0.80608612, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82848287, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.638502597808838 + }, + { + "auxiliary_loss_clip": 0.01116972, + "auxiliary_loss_mlp": 0.01103641, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00054836, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.803277945001157, + "language_loss": 0.7462008, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76840699, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.699120283126831 + }, + { + "auxiliary_loss_clip": 0.01133787, + "auxiliary_loss_mlp": 0.01104103, + "balance_loss_clip": 1.00190341, + "balance_loss_mlp": 1.00034308, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.5603622931483823, + "language_loss": 0.78250861, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80488747, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 2.6205098628997803 + }, + { + "auxiliary_loss_clip": 0.01150817, + "auxiliary_loss_mlp": 0.00747428, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00037098, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.876228067026336, + "language_loss": 0.66165829, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68064076, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 2.5166616439819336 + }, + { + "auxiliary_loss_clip": 0.01131752, + "auxiliary_loss_mlp": 0.01103643, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00055063, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.4332483257413406, + "language_loss": 0.77734995, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.7997039, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 2.7528610229492188 + }, + { + "auxiliary_loss_clip": 0.01150576, + "auxiliary_loss_mlp": 0.01103965, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00068176, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 2.4547211450218485, + "language_loss": 0.82188898, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84443438, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.533048391342163 + }, + { + "auxiliary_loss_clip": 0.01148633, + "auxiliary_loss_mlp": 0.0110528, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00047171, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.088808294622289, + "language_loss": 0.80798018, + "learning_rate": 4.537088934794913e-07, + "loss": 0.83051932, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 2.5744011402130127 + }, + { + "auxiliary_loss_clip": 0.01165332, + "auxiliary_loss_mlp": 0.01104603, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00055695, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.5010875005945707, + "language_loss": 0.74065804, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.7633574, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.5014848709106445 + }, + { + "auxiliary_loss_clip": 0.01083582, + "auxiliary_loss_mlp": 0.01103976, + "balance_loss_clip": 1.00156426, + "balance_loss_mlp": 1.00059748, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 3.015061124108655, + "language_loss": 0.75882298, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.78069854, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.7148609161376953 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_clip": 1.00168848, + "balance_loss_mlp": 1.00046659, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.089717699167527, + "language_loss": 0.7291342, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75116318, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 2.641202926635742 + }, + { + "auxiliary_loss_clip": 0.0116514, + "auxiliary_loss_mlp": 0.01104895, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00056255, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.8660185422970983, + "language_loss": 0.73574531, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75844568, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.4967246055603027 + }, + { + "auxiliary_loss_clip": 0.0116083, + "auxiliary_loss_mlp": 0.01080848, + "balance_loss_clip": 1.00125718, + "balance_loss_mlp": 0.99997598, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8864909929224613, + "language_loss": 0.60294569, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62536246, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 4.575252056121826 + }, + { + "auxiliary_loss_clip": 0.01116627, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_clip": 1.00200343, + "balance_loss_mlp": 1.00055599, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.6541255396754013, + "language_loss": 0.72133487, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74354333, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.64577317237854 + }, + { + "auxiliary_loss_clip": 0.01100197, + "auxiliary_loss_mlp": 0.01103528, + "balance_loss_clip": 1.00158811, + "balance_loss_mlp": 1.00043559, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3550151229882497, + "language_loss": 0.75390601, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77594328, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 4.187507390975952 + }, + { + "auxiliary_loss_clip": 0.01150765, + "auxiliary_loss_mlp": 0.01103889, + "balance_loss_clip": 1.00191557, + "balance_loss_mlp": 1.00060618, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 2.0636279164299673, + "language_loss": 0.62087363, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.64342016, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.5702147483825684 + }, + { + "auxiliary_loss_clip": 0.01132164, + "auxiliary_loss_mlp": 0.01104188, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.00052381, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.6236218170721348, + "language_loss": 0.6750508, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69741428, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 2.6293904781341553 + }, + { + "auxiliary_loss_clip": 0.01116449, + "auxiliary_loss_mlp": 0.01104062, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 2.4537242689857166, + "language_loss": 0.57678354, + "learning_rate": 4.5124174933361e-07, + "loss": 0.59898865, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 2.587350845336914 + }, + { + "auxiliary_loss_clip": 0.01098731, + "auxiliary_loss_mlp": 0.01103989, + "balance_loss_clip": 1.00164533, + "balance_loss_mlp": 1.00041962, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.754836610840206, + "language_loss": 0.66809517, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69012237, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.7001426219940186 + }, + { + "auxiliary_loss_clip": 0.01134744, + "auxiliary_loss_mlp": 0.01104753, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00042105, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.9532600443250474, + "language_loss": 0.8877269, + "learning_rate": 4.50749024954048e-07, + "loss": 0.9101218, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 2.5762290954589844 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.01105523, + "balance_loss_clip": 1.00169563, + "balance_loss_mlp": 1.00061882, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 2.33568320063468, + "language_loss": 0.72960007, + "learning_rate": 4.505027508812245e-07, + "loss": 0.75197577, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.576004981994629 + }, + { + "auxiliary_loss_clip": 0.01148483, + "auxiliary_loss_mlp": 0.01103722, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00053477, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.4686094369796976, + "language_loss": 0.80380559, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82632768, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 2.52194881439209 + }, + { + "auxiliary_loss_clip": 0.01148571, + "auxiliary_loss_mlp": 0.01104999, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00047612, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.6301996304266337, + "language_loss": 0.73127502, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75381076, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.5584323406219482 + }, + { + "auxiliary_loss_clip": 0.01150455, + "auxiliary_loss_mlp": 0.01104193, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00043297, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.3963565537374947, + "language_loss": 0.71889675, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74144328, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.6056244373321533 + }, + { + "auxiliary_loss_clip": 0.01133564, + "auxiliary_loss_mlp": 0.0074731, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00042951, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.4623661170103028, + "language_loss": 0.78804672, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80685544, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.741516351699829 + }, + { + "auxiliary_loss_clip": 0.01150629, + "auxiliary_loss_mlp": 0.01103639, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00054681, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3345544869948431, + "language_loss": 0.80092084, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.8234635, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.6094424724578857 + }, + { + "auxiliary_loss_clip": 0.01129795, + "auxiliary_loss_mlp": 0.01104051, + "balance_loss_clip": 1.00189126, + "balance_loss_mlp": 1.00048232, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 1.7800021284625969, + "language_loss": 0.78395677, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80629528, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 2.5741689205169678 + }, + { + "auxiliary_loss_clip": 0.01119967, + "auxiliary_loss_mlp": 0.01104225, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.000561, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 2.0187544726911275, + "language_loss": 0.66834044, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69058239, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 2.6117796897888184 + }, + { + "auxiliary_loss_clip": 0.01134005, + "auxiliary_loss_mlp": 0.0110463, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00048912, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 1.9443351527959563, + "language_loss": 0.72947943, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75186574, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.6721785068511963 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01105211, + "balance_loss_clip": 1.00170159, + "balance_loss_mlp": 1.0004971, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 2.051880693854685, + "language_loss": 0.72490954, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74731994, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.6257007122039795 + }, + { + "auxiliary_loss_clip": 0.01135379, + "auxiliary_loss_mlp": 0.0110502, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00049698, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 1.6799628856684572, + "language_loss": 0.76806045, + "learning_rate": 4.480432433327845e-07, + "loss": 0.7904644, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.571882963180542 + }, + { + "auxiliary_loss_clip": 0.01150267, + "auxiliary_loss_mlp": 0.01104762, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00062096, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.6924524685099762, + "language_loss": 0.85853827, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88108861, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.580838203430176 + }, + { + "auxiliary_loss_clip": 0.01150418, + "auxiliary_loss_mlp": 0.01104143, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00066876, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.053028826482059, + "language_loss": 0.69528371, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71782929, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 2.554382085800171 + }, + { + "auxiliary_loss_clip": 0.01146214, + "auxiliary_loss_mlp": 0.01080857, + "balance_loss_clip": 1.00129199, + "balance_loss_mlp": 0.99998587, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 1.1439865470468207, + "language_loss": 0.61602247, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63829321, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 3.131531000137329 + }, + { + "auxiliary_loss_clip": 0.0114828, + "auxiliary_loss_mlp": 0.01104004, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00043488, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.7353351344451904, + "language_loss": 0.73637426, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.75889707, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 2.5868747234344482 + }, + { + "auxiliary_loss_clip": 0.01130398, + "auxiliary_loss_mlp": 0.0110655, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00040603, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.0599684025301404, + "language_loss": 0.6856426, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.7080121, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 2.563157320022583 + }, + { + "auxiliary_loss_clip": 0.01148581, + "auxiliary_loss_mlp": 0.01103649, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00065255, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 1.9320821280306024, + "language_loss": 0.62513858, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64766091, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 2.5273189544677734 + }, + { + "auxiliary_loss_clip": 0.01116459, + "auxiliary_loss_mlp": 0.01105838, + "balance_loss_clip": 1.00184095, + "balance_loss_mlp": 1.00055218, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.023719606277565, + "language_loss": 0.79743928, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81966221, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 2.5784969329833984 + }, + { + "auxiliary_loss_clip": 0.01150679, + "auxiliary_loss_mlp": 0.01103774, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00058627, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.7362215892427044, + "language_loss": 0.80497527, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82751977, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 4.007436513900757 + }, + { + "auxiliary_loss_clip": 0.01147994, + "auxiliary_loss_mlp": 0.01104027, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00045848, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.548557998467883, + "language_loss": 0.72587007, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74839032, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.5850086212158203 + }, + { + "auxiliary_loss_clip": 0.01165458, + "auxiliary_loss_mlp": 0.0110549, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00068116, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 2.4472300585015767, + "language_loss": 0.70506358, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72777313, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.4620280265808105 + }, + { + "auxiliary_loss_clip": 0.01165205, + "auxiliary_loss_mlp": 0.01103935, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00065207, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.5738078954542976, + "language_loss": 0.74169421, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.76438558, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 2.5078935623168945 + }, + { + "auxiliary_loss_clip": 0.01115405, + "auxiliary_loss_mlp": 0.01104364, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00069928, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 1.9048818154537568, + "language_loss": 0.68710923, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70930701, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.593578815460205 + }, + { + "auxiliary_loss_clip": 0.01146181, + "auxiliary_loss_mlp": 0.01080841, + "balance_loss_clip": 1.00126481, + "balance_loss_mlp": 0.99996912, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.9260990217418671, + "language_loss": 0.60201848, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62428868, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 3.2090253829956055 + }, + { + "auxiliary_loss_clip": 0.01165388, + "auxiliary_loss_mlp": 0.01104103, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.0005343, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.612583004565929, + "language_loss": 0.76240832, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.7851032, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 2.586278200149536 + }, + { + "auxiliary_loss_clip": 0.01148709, + "auxiliary_loss_mlp": 0.01104625, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00057888, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 2.6072124866980158, + "language_loss": 0.68464941, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70718277, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.5403428077697754 + }, + { + "auxiliary_loss_clip": 0.01083766, + "auxiliary_loss_mlp": 0.01080596, + "balance_loss_clip": 1.00129151, + "balance_loss_mlp": 1.00010622, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8923838552371165, + "language_loss": 0.59964609, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62128973, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.052435874938965 + }, + { + "auxiliary_loss_clip": 0.01116432, + "auxiliary_loss_mlp": 0.01103988, + "balance_loss_clip": 1.00180447, + "balance_loss_mlp": 1.00051475, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.4623552535228077, + "language_loss": 0.74309784, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76530206, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 2.7357594966888428 + }, + { + "auxiliary_loss_clip": 0.0115024, + "auxiliary_loss_mlp": 0.01104261, + "balance_loss_clip": 1.00188911, + "balance_loss_mlp": 1.00050092, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.833855180568857, + "language_loss": 0.83124638, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85379136, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 2.5489842891693115 + }, + { + "auxiliary_loss_clip": 0.01148748, + "auxiliary_loss_mlp": 0.01103322, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00042057, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.6288922203911453, + "language_loss": 0.73051393, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75303459, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.5953211784362793 + }, + { + "auxiliary_loss_clip": 0.01165286, + "auxiliary_loss_mlp": 0.01104539, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00049305, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.792534726427853, + "language_loss": 0.75406241, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77676064, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 3.9908664226531982 + }, + { + "auxiliary_loss_clip": 0.01148773, + "auxiliary_loss_mlp": 0.01104201, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00063205, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.7622968396586522, + "language_loss": 0.72212481, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74465454, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 3.955815315246582 + }, + { + "auxiliary_loss_clip": 0.01150626, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_clip": 1.00186622, + "balance_loss_mlp": 1.00056028, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 1.8916673326782363, + "language_loss": 0.71701181, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.7395699, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 3.981621026992798 + }, + { + "auxiliary_loss_clip": 0.01121448, + "auxiliary_loss_mlp": 0.01104841, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00050914, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 1.9544252924640861, + "language_loss": 0.6507411, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67300397, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 2.665475368499756 + }, + { + "auxiliary_loss_clip": 0.01165415, + "auxiliary_loss_mlp": 0.01103537, + "balance_loss_clip": 1.00197017, + "balance_loss_mlp": 1.00054038, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 1.6496820938734371, + "language_loss": 0.69577765, + "learning_rate": 4.421644538650231e-07, + "loss": 0.71846718, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 2.5003128051757812 + }, + { + "auxiliary_loss_clip": 0.01133407, + "auxiliary_loss_mlp": 0.01105079, + "balance_loss_clip": 1.00181901, + "balance_loss_mlp": 1.00055599, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.2847887738521189, + "language_loss": 0.70063198, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72301692, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.743194103240967 + }, + { + "auxiliary_loss_clip": 0.01116734, + "auxiliary_loss_mlp": 0.00747238, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00040507, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.9417927824239138, + "language_loss": 0.72779036, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.7464301, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.6032466888427734 + }, + { + "auxiliary_loss_clip": 0.01165271, + "auxiliary_loss_mlp": 0.01104932, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00050426, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.494293317708329, + "language_loss": 0.78811073, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81081277, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 2.49515700340271 + }, + { + "auxiliary_loss_clip": 0.01148804, + "auxiliary_loss_mlp": 0.01105573, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00038278, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 2.4652788641385137, + "language_loss": 0.69916767, + "learning_rate": 4.411879602612185e-07, + "loss": 0.7217114, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.5298588275909424 + }, + { + "auxiliary_loss_clip": 0.01165453, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00050974, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 1.5693383131096592, + "language_loss": 0.7664783, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78918129, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 2.524641275405884 + }, + { + "auxiliary_loss_clip": 0.01120544, + "auxiliary_loss_mlp": 0.01103936, + "balance_loss_clip": 1.00164962, + "balance_loss_mlp": 1.00055742, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.6612815886538344, + "language_loss": 0.651775, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67401981, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.7142372131347656 + }, + { + "auxiliary_loss_clip": 0.01150752, + "auxiliary_loss_mlp": 0.01104753, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.0006119, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.8708804549471805, + "language_loss": 0.73638868, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.75894374, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.6210622787475586 + }, + { + "auxiliary_loss_clip": 0.01148526, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00064051, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.4428416987542056, + "language_loss": 0.67544109, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69795793, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 2.524693727493286 + }, + { + "auxiliary_loss_clip": 0.01148502, + "auxiliary_loss_mlp": 0.01103976, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.7935075193729382, + "language_loss": 0.67258549, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69511032, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.5418152809143066 + }, + { + "auxiliary_loss_clip": 0.01133084, + "auxiliary_loss_mlp": 0.01102362, + "balance_loss_clip": 1.00165153, + "balance_loss_mlp": 1.00050986, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.219754504607329, + "language_loss": 0.72681284, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74916726, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 2.5562984943389893 + }, + { + "auxiliary_loss_clip": 0.01133591, + "auxiliary_loss_mlp": 0.01105016, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00058877, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 1.5340369189354153, + "language_loss": 0.73345697, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75584304, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 2.6311557292938232 + }, + { + "auxiliary_loss_clip": 0.01131911, + "auxiliary_loss_mlp": 0.01104494, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00044847, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.6345224423644023, + "language_loss": 0.71909565, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74145973, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 2.583186149597168 + }, + { + "auxiliary_loss_clip": 0.01118637, + "auxiliary_loss_mlp": 0.0110429, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00052977, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.8389571207035469, + "language_loss": 0.69871783, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72094715, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.66042160987854 + }, + { + "auxiliary_loss_clip": 0.01115744, + "auxiliary_loss_mlp": 0.01104356, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.00050139, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 2.4296603575462368, + "language_loss": 0.66654789, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68874896, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.654996156692505 + }, + { + "auxiliary_loss_clip": 0.01101842, + "auxiliary_loss_mlp": 0.01102851, + "balance_loss_clip": 1.00165236, + "balance_loss_mlp": 1.00042641, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.9996447343337223, + "language_loss": 0.72457772, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74662471, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.6366536617279053 + }, + { + "auxiliary_loss_clip": 0.01165239, + "auxiliary_loss_mlp": 0.0110466, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.0006144, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.5288087507461243, + "language_loss": 0.77290416, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79560316, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.542151689529419 + }, + { + "auxiliary_loss_clip": 0.01114484, + "auxiliary_loss_mlp": 0.01103417, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00051522, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6527428147440824, + "language_loss": 0.8422941, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.8644731, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.6411402225494385 + }, + { + "auxiliary_loss_clip": 0.0116523, + "auxiliary_loss_mlp": 0.01103989, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00042009, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.5529682687398911, + "language_loss": 0.7259503, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74864244, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 2.494736909866333 + }, + { + "auxiliary_loss_clip": 0.01165377, + "auxiliary_loss_mlp": 0.01104824, + "balance_loss_clip": 1.00188911, + "balance_loss_mlp": 1.0004921, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.8484082537417528, + "language_loss": 0.67043006, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69313204, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 2.664623498916626 + }, + { + "auxiliary_loss_clip": 0.01150005, + "auxiliary_loss_mlp": 0.011043, + "balance_loss_clip": 1.00180399, + "balance_loss_mlp": 1.00044453, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.6868121902118318, + "language_loss": 0.7096734, + "learning_rate": 4.372914494109412e-07, + "loss": 0.73221642, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 2.5337533950805664 + }, + { + "auxiliary_loss_clip": 0.01148512, + "auxiliary_loss_mlp": 0.0110404, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00047076, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 2.2516636054008203, + "language_loss": 0.66696352, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68948907, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 2.6306443214416504 + }, + { + "auxiliary_loss_clip": 0.01133719, + "auxiliary_loss_mlp": 0.01103597, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00050473, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.7397136445858772, + "language_loss": 0.79290867, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81528181, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 3.993598222732544 + }, + { + "auxiliary_loss_clip": 0.01120118, + "auxiliary_loss_mlp": 0.01103985, + "balance_loss_clip": 1.00169182, + "balance_loss_mlp": 1.0004158, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 2.510464973756063, + "language_loss": 0.7655043, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78774536, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.6383049488067627 + }, + { + "auxiliary_loss_clip": 0.01135235, + "auxiliary_loss_mlp": 0.01103261, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00055075, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.5212593981151163, + "language_loss": 0.7149297, + "learning_rate": 4.363196905447297e-07, + "loss": 0.7373147, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.6388626098632812 + }, + { + "auxiliary_loss_clip": 0.01150439, + "auxiliary_loss_mlp": 0.01104506, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00046015, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.8784297479253182, + "language_loss": 0.59659147, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61914086, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 2.532176971435547 + }, + { + "auxiliary_loss_clip": 0.01165252, + "auxiliary_loss_mlp": 0.0110416, + "balance_loss_clip": 1.00206828, + "balance_loss_mlp": 1.00059152, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.7096513501011397, + "language_loss": 0.7382412, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.76093537, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.4924702644348145 + }, + { + "auxiliary_loss_clip": 0.01150787, + "auxiliary_loss_mlp": 0.01104026, + "balance_loss_clip": 1.00197005, + "balance_loss_mlp": 1.00055242, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 2.030714492124233, + "language_loss": 0.64212507, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66467321, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 2.5126261711120605 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01103362, + "balance_loss_clip": 1.00161374, + "balance_loss_mlp": 1.00046015, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.4468022502915892, + "language_loss": 0.68790936, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.71025902, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 2.656546115875244 + }, + { + "auxiliary_loss_clip": 0.01165175, + "auxiliary_loss_mlp": 0.01104168, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.0005033, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 4.112876404115346, + "language_loss": 0.73832625, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76101965, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 2.5077850818634033 + }, + { + "auxiliary_loss_clip": 0.01148967, + "auxiliary_loss_mlp": 0.01105721, + "balance_loss_clip": 1.00181973, + "balance_loss_mlp": 1.00062561, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.341756616805399, + "language_loss": 0.81423128, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83677816, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.534313917160034 + }, + { + "auxiliary_loss_clip": 0.01134005, + "auxiliary_loss_mlp": 0.01104013, + "balance_loss_clip": 1.00165057, + "balance_loss_mlp": 1.00063503, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 11.799143220213297, + "language_loss": 0.77549279, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79787302, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.5918571949005127 + }, + { + "auxiliary_loss_clip": 0.01146744, + "auxiliary_loss_mlp": 0.01105971, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00058997, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 1.806966715452758, + "language_loss": 0.73850226, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76102942, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.5295605659484863 + }, + { + "auxiliary_loss_clip": 0.01133933, + "auxiliary_loss_mlp": 0.01104548, + "balance_loss_clip": 1.00190318, + "balance_loss_mlp": 1.00050187, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.6464054768282441, + "language_loss": 0.67949545, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70188022, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 2.6962411403656006 + }, + { + "auxiliary_loss_clip": 0.01115532, + "auxiliary_loss_mlp": 0.01104784, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00064266, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.62848227257055, + "language_loss": 0.70922142, + "learning_rate": 4.338944453112907e-07, + "loss": 0.73142457, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 4.15122127532959 + }, + { + "auxiliary_loss_clip": 0.01149041, + "auxiliary_loss_mlp": 0.01104568, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.691109903561929, + "language_loss": 0.65430689, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67684293, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 3.9269752502441406 + }, + { + "auxiliary_loss_clip": 0.01150754, + "auxiliary_loss_mlp": 0.01103895, + "balance_loss_clip": 1.00192583, + "balance_loss_mlp": 1.0006125, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.7648175179350958, + "language_loss": 0.76960981, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79215628, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.5701653957366943 + }, + { + "auxiliary_loss_clip": 0.01135383, + "auxiliary_loss_mlp": 0.01103226, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00051475, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 1.9817029821237988, + "language_loss": 0.72441274, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74679881, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 2.5423970222473145 + }, + { + "auxiliary_loss_clip": 0.01165231, + "auxiliary_loss_mlp": 0.00747536, + "balance_loss_clip": 1.0018276, + "balance_loss_mlp": 1.00040853, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.8534195685983095, + "language_loss": 0.63287425, + "learning_rate": 4.329260095357725e-07, + "loss": 0.65200186, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 2.4996907711029053 + }, + { + "auxiliary_loss_clip": 0.01098615, + "auxiliary_loss_mlp": 0.01103616, + "balance_loss_clip": 1.00163591, + "balance_loss_mlp": 1.00061893, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 1.8093148125934253, + "language_loss": 0.72475427, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74677658, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.613162040710449 + }, + { + "auxiliary_loss_clip": 0.01148485, + "auxiliary_loss_mlp": 0.01103663, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00047541, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.6860473712122073, + "language_loss": 0.73096865, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75349009, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 2.58949613571167 + }, + { + "auxiliary_loss_clip": 0.01148437, + "auxiliary_loss_mlp": 0.01104123, + "balance_loss_clip": 1.00180781, + "balance_loss_mlp": 1.00055397, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.5702013827957875, + "language_loss": 0.68626863, + "learning_rate": 4.322003066198219e-07, + "loss": 0.70879424, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 2.53579044342041 + }, + { + "auxiliary_loss_clip": 0.01120192, + "auxiliary_loss_mlp": 0.01104877, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.0005455, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.4965436114745603, + "language_loss": 0.74849343, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77074414, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.6300787925720215 + }, + { + "auxiliary_loss_clip": 0.01148586, + "auxiliary_loss_mlp": 0.01104567, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00052094, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.4159532583852552, + "language_loss": 0.72012627, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74265778, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.6341888904571533 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.0110529, + "balance_loss_clip": 1.00190866, + "balance_loss_mlp": 1.00057614, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 2.9442363891150816, + "language_loss": 0.70105553, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72376245, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 2.5090439319610596 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01104438, + "balance_loss_clip": 1.00187576, + "balance_loss_mlp": 1.00058341, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 2.0891347067925397, + "language_loss": 0.77802455, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.80007327, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 2.7068066596984863 + }, + { + "auxiliary_loss_clip": 0.01115221, + "auxiliary_loss_mlp": 0.01104731, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00068569, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.473291316687242, + "language_loss": 0.68616968, + "learning_rate": 4.309919909045268e-07, + "loss": 0.70836926, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 2.7406511306762695 + }, + { + "auxiliary_loss_clip": 0.01148501, + "auxiliary_loss_mlp": 0.0110393, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00045633, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 2.1686523250356897, + "language_loss": 0.65061879, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67314315, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.622190237045288 + }, + { + "auxiliary_loss_clip": 0.01100369, + "auxiliary_loss_mlp": 0.01103111, + "balance_loss_clip": 1.00143123, + "balance_loss_mlp": 1.00049531, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.2628026477403145, + "language_loss": 0.72352433, + "learning_rate": 4.30509081032864e-07, + "loss": 0.7455591, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 2.6281464099884033 + }, + { + "auxiliary_loss_clip": 0.01136169, + "auxiliary_loss_mlp": 0.01103702, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00061035, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.9652297691287324, + "language_loss": 0.80435789, + "learning_rate": 4.302677153653349e-07, + "loss": 0.8267566, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.590394973754883 + }, + { + "auxiliary_loss_clip": 0.01148429, + "auxiliary_loss_mlp": 0.01103458, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00065184, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.5479897624577046, + "language_loss": 0.77153134, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79405022, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 2.540985584259033 + }, + { + "auxiliary_loss_clip": 0.01165133, + "auxiliary_loss_mlp": 0.01104311, + "balance_loss_clip": 1.00188267, + "balance_loss_mlp": 1.0005517, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.4621694955047437, + "language_loss": 0.67190135, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69459581, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 2.5154166221618652 + }, + { + "auxiliary_loss_clip": 0.01148196, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00041413, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 2.902788803799401, + "language_loss": 0.75239944, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77492791, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 2.554513692855835 + }, + { + "auxiliary_loss_clip": 0.01085632, + "auxiliary_loss_mlp": 0.01104624, + "balance_loss_clip": 1.00160635, + "balance_loss_mlp": 1.00048304, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 2.04742969525629, + "language_loss": 0.66124141, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68314397, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.751425266265869 + }, + { + "auxiliary_loss_clip": 0.01083931, + "auxiliary_loss_mlp": 0.01102012, + "balance_loss_clip": 1.0013938, + "balance_loss_mlp": 1.00035083, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3669140717108712, + "language_loss": 0.79307234, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81493175, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.747502088546753 + }, + { + "auxiliary_loss_clip": 0.01120979, + "auxiliary_loss_mlp": 0.01104118, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00054908, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.7757498823126603, + "language_loss": 0.77550006, + "learning_rate": 4.28820771692858e-07, + "loss": 0.79775095, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.62600040435791 + }, + { + "auxiliary_loss_clip": 0.01133514, + "auxiliary_loss_mlp": 0.01104805, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00056803, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 1.9504464338196619, + "language_loss": 0.78882265, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81120586, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.5874462127685547 + }, + { + "auxiliary_loss_clip": 0.011164, + "auxiliary_loss_mlp": 0.01104155, + "balance_loss_clip": 1.00159574, + "balance_loss_mlp": 1.00058627, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.8969683042157766, + "language_loss": 0.84052807, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86273366, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 2.6435108184814453 + }, + { + "auxiliary_loss_clip": 0.01098519, + "auxiliary_loss_mlp": 0.01080487, + "balance_loss_clip": 1.00116301, + "balance_loss_mlp": 0.99999696, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.724395867826351, + "language_loss": 0.58307236, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60486245, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 3.2987654209136963 + }, + { + "auxiliary_loss_clip": 0.01119021, + "auxiliary_loss_mlp": 0.0110485, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00061321, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 3.0975962328984714, + "language_loss": 0.63232827, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.654567, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 4.170664072036743 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.01104451, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00059628, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.603294447712832, + "language_loss": 0.68943387, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71196604, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 2.596860885620117 + }, + { + "auxiliary_loss_clip": 0.01148569, + "auxiliary_loss_mlp": 0.0110427, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00070119, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.5512714423961937, + "language_loss": 0.7249589, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74748731, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 2.595228910446167 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01102744, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00050986, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 2.2898977506291396, + "language_loss": 0.80666888, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82919627, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.5651233196258545 + }, + { + "auxiliary_loss_clip": 0.01148254, + "auxiliary_loss_mlp": 0.01105183, + "balance_loss_clip": 1.00195944, + "balance_loss_mlp": 1.0004698, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.5410047431943354, + "language_loss": 0.67843986, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70097423, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 2.522742509841919 + }, + { + "auxiliary_loss_clip": 0.01165342, + "auxiliary_loss_mlp": 0.01103439, + "balance_loss_clip": 1.00205314, + "balance_loss_mlp": 1.0005374, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.8404192041746787, + "language_loss": 0.72791886, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.75060666, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.5325348377227783 + }, + { + "auxiliary_loss_clip": 0.01101891, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_clip": 1.00160789, + "balance_loss_mlp": 1.00046039, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5612707971073108, + "language_loss": 0.78689045, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.8089515, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.7075977325439453 + }, + { + "auxiliary_loss_clip": 0.01150627, + "auxiliary_loss_mlp": 0.01103874, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00059164, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.6198986864721152, + "language_loss": 0.73624146, + "learning_rate": 4.261736137111598e-07, + "loss": 0.75878644, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.5883290767669678 + }, + { + "auxiliary_loss_clip": 0.01133879, + "auxiliary_loss_mlp": 0.01103122, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00060153, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.7718101808070899, + "language_loss": 0.74219501, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76456505, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 2.5636818408966064 + }, + { + "auxiliary_loss_clip": 0.01148166, + "auxiliary_loss_mlp": 0.0110441, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00045907, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.9495105063677027, + "language_loss": 0.83311415, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85563987, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 2.5286595821380615 + }, + { + "auxiliary_loss_clip": 0.01150478, + "auxiliary_loss_mlp": 0.01105081, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00055838, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.758623527139833, + "language_loss": 0.75335741, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.775913, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.5731425285339355 + }, + { + "auxiliary_loss_clip": 0.01131287, + "auxiliary_loss_mlp": 0.01104852, + "balance_loss_clip": 1.00198281, + "balance_loss_mlp": 1.00071025, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.5980459824836166, + "language_loss": 0.72537911, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74774051, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.7016725540161133 + }, + { + "auxiliary_loss_clip": 0.01149667, + "auxiliary_loss_mlp": 0.01103964, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00049019, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 1.8300808122638497, + "language_loss": 0.74824059, + "learning_rate": 4.249727465395634e-07, + "loss": 0.77077687, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 3.9209558963775635 + }, + { + "auxiliary_loss_clip": 0.0112763, + "auxiliary_loss_mlp": 0.01080494, + "balance_loss_clip": 1.00127268, + "balance_loss_mlp": 1.0000037, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7677430070027301, + "language_loss": 0.67071193, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69279319, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 5.864396333694458 + }, + { + "auxiliary_loss_clip": 0.01148549, + "auxiliary_loss_mlp": 0.01103162, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00054669, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.8161188794020582, + "language_loss": 0.71188843, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73440552, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 2.563128709793091 + }, + { + "auxiliary_loss_clip": 0.01160826, + "auxiliary_loss_mlp": 0.01080838, + "balance_loss_clip": 1.00129724, + "balance_loss_mlp": 0.99996608, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6665437077597967, + "language_loss": 0.55009633, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57251292, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 3.174144744873047 + }, + { + "auxiliary_loss_clip": 0.01133038, + "auxiliary_loss_mlp": 0.01102999, + "balance_loss_clip": 1.00168359, + "balance_loss_mlp": 1.00038397, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 2.0042966957153543, + "language_loss": 0.65063626, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67299664, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.5749454498291016 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01104498, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.00054789, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 2.0147523854363834, + "language_loss": 0.70274514, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72477245, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.779662609100342 + }, + { + "auxiliary_loss_clip": 0.01114307, + "auxiliary_loss_mlp": 0.01102902, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00038242, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.8069260270863503, + "language_loss": 0.69175088, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71392298, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.649428129196167 + }, + { + "auxiliary_loss_clip": 0.01102383, + "auxiliary_loss_mlp": 0.01104098, + "balance_loss_clip": 1.00167751, + "balance_loss_mlp": 1.00062406, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.4302953586819915, + "language_loss": 0.70555657, + "learning_rate": 4.232940412119095e-07, + "loss": 0.72762138, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 2.843949556350708 + }, + { + "auxiliary_loss_clip": 0.01148315, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00054574, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.714879257012532, + "language_loss": 0.71765536, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74019873, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.6005706787109375 + }, + { + "auxiliary_loss_clip": 0.01127632, + "auxiliary_loss_mlp": 0.01080858, + "balance_loss_clip": 1.00138807, + "balance_loss_mlp": 0.99998629, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.912977282363307, + "language_loss": 0.63636231, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65844727, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.1844120025634766 + }, + { + "auxiliary_loss_clip": 0.01133375, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_clip": 1.00173998, + "balance_loss_mlp": 1.00041294, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.6774096241398597, + "language_loss": 0.6953271, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.71769303, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 2.5846548080444336 + }, + { + "auxiliary_loss_clip": 0.01150497, + "auxiliary_loss_mlp": 0.01103287, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00038552, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.9259592722801648, + "language_loss": 0.78209853, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80463636, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.585411310195923 + }, + { + "auxiliary_loss_clip": 0.01148625, + "auxiliary_loss_mlp": 0.01104228, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00046778, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 1.99007577767883, + "language_loss": 0.79110569, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81363422, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 2.5556368827819824 + }, + { + "auxiliary_loss_clip": 0.0113281, + "auxiliary_loss_mlp": 0.00747348, + "balance_loss_clip": 1.00171447, + "balance_loss_mlp": 1.00035548, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.650448233282436, + "language_loss": 0.7016778, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72047937, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.579345226287842 + }, + { + "auxiliary_loss_clip": 0.01117435, + "auxiliary_loss_mlp": 0.01104698, + "balance_loss_clip": 1.00177515, + "balance_loss_mlp": 1.00046146, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.3817646108000883, + "language_loss": 0.67838156, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70060289, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 2.636852264404297 + }, + { + "auxiliary_loss_clip": 0.01100929, + "auxiliary_loss_mlp": 0.01102863, + "balance_loss_clip": 1.00168014, + "balance_loss_mlp": 1.00043869, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.5690335768825217, + "language_loss": 0.74880135, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77083927, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.6782541275024414 + }, + { + "auxiliary_loss_clip": 0.01148537, + "auxiliary_loss_mlp": 0.01104551, + "balance_loss_clip": 1.0019263, + "balance_loss_mlp": 1.00050509, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 2.09561975222759, + "language_loss": 0.7160337, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73856461, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.540580987930298 + }, + { + "auxiliary_loss_clip": 0.01148808, + "auxiliary_loss_mlp": 0.01104211, + "balance_loss_clip": 1.00166512, + "balance_loss_mlp": 1.00045085, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.6317522577922863, + "language_loss": 0.73719716, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.75972736, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.5077123641967773 + }, + { + "auxiliary_loss_clip": 0.01165424, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00058949, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.7602029900169807, + "language_loss": 0.69432747, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71702421, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 2.5553529262542725 + }, + { + "auxiliary_loss_clip": 0.01146261, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_clip": 1.00126755, + "balance_loss_mlp": 0.99998361, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8850929560041223, + "language_loss": 0.58649993, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60876727, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 2.9636447429656982 + }, + { + "auxiliary_loss_clip": 0.01116608, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00058079, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 5.890916020997167, + "language_loss": 0.64936239, + "learning_rate": 4.201842205128772e-07, + "loss": 0.67156613, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 2.76865816116333 + }, + { + "auxiliary_loss_clip": 0.01165312, + "auxiliary_loss_mlp": 0.01104164, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00059462, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 1.6903162522315598, + "language_loss": 0.7574544, + "learning_rate": 4.199454226296526e-07, + "loss": 0.7801491, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.5081281661987305 + }, + { + "auxiliary_loss_clip": 0.01118492, + "auxiliary_loss_mlp": 0.01104007, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00053346, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.6199604915915458, + "language_loss": 0.7932198, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81544483, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.6697208881378174 + }, + { + "auxiliary_loss_clip": 0.01150568, + "auxiliary_loss_mlp": 0.01103704, + "balance_loss_clip": 1.00172853, + "balance_loss_mlp": 1.00042081, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.8512432058979513, + "language_loss": 0.68320221, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70574492, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.519110679626465 + }, + { + "auxiliary_loss_clip": 0.01134028, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_clip": 1.00178611, + "balance_loss_mlp": 1.00061715, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.3545068689861735, + "language_loss": 0.79113925, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81351471, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.588914155960083 + }, + { + "auxiliary_loss_clip": 0.01133001, + "auxiliary_loss_mlp": 0.01104403, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00045228, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.8181843810557803, + "language_loss": 0.6640473, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68642139, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 4.139112710952759 + }, + { + "auxiliary_loss_clip": 0.01131925, + "auxiliary_loss_mlp": 0.01103223, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00051272, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 1.8004723863836838, + "language_loss": 0.71695936, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.7393108, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 2.6255078315734863 + }, + { + "auxiliary_loss_clip": 0.0113441, + "auxiliary_loss_mlp": 0.01104564, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00051808, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.0381289601811585, + "language_loss": 0.76385957, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78624928, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.5968761444091797 + }, + { + "auxiliary_loss_clip": 0.01132154, + "auxiliary_loss_mlp": 0.01103389, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.00039196, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.184820113807765, + "language_loss": 0.61780328, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.64015871, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 2.5702269077301025 + }, + { + "auxiliary_loss_clip": 0.01135025, + "auxiliary_loss_mlp": 0.01103117, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00050163, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.010138704521087, + "language_loss": 0.72196424, + "learning_rate": 4.180371972938206e-07, + "loss": 0.74434566, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 2.566786289215088 + }, + { + "auxiliary_loss_clip": 0.01165361, + "auxiliary_loss_mlp": 0.01104445, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00058949, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.7669202761349025, + "language_loss": 0.72518629, + "learning_rate": 4.177989389787624e-07, + "loss": 0.74788433, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.5201046466827393 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01102588, + "balance_loss_clip": 1.00199735, + "balance_loss_mlp": 1.0004499, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.560980595342227, + "language_loss": 0.66097057, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68364775, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.567697048187256 + }, + { + "auxiliary_loss_clip": 0.01118466, + "auxiliary_loss_mlp": 0.01103831, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00054824, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.5895871168289564, + "language_loss": 0.67757666, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69979966, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.635533332824707 + }, + { + "auxiliary_loss_clip": 0.01150079, + "auxiliary_loss_mlp": 0.01103733, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00064075, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.728570078903234, + "language_loss": 0.69416839, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71670651, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 2.5650393962860107 + }, + { + "auxiliary_loss_clip": 0.01165142, + "auxiliary_loss_mlp": 0.01103469, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00047183, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.455317199697836, + "language_loss": 0.79463756, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81732368, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 2.5047073364257812 + }, + { + "auxiliary_loss_clip": 0.01148477, + "auxiliary_loss_mlp": 0.01104085, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00051594, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.6368628657142992, + "language_loss": 0.65990943, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68243504, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.5691020488739014 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01105217, + "balance_loss_clip": 1.00198901, + "balance_loss_mlp": 1.00050378, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 1.686667549308943, + "language_loss": 0.7215873, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74398232, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 2.576026201248169 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.01104394, + "balance_loss_clip": 1.00190556, + "balance_loss_mlp": 1.00044298, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.6412238642209673, + "language_loss": 0.68581676, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.70836627, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.5396833419799805 + }, + { + "auxiliary_loss_clip": 0.0114838, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00052893, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.6464818616408297, + "language_loss": 0.73946381, + "learning_rate": 4.158950331167641e-07, + "loss": 0.76197243, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 4.111054420471191 + }, + { + "auxiliary_loss_clip": 0.01133507, + "auxiliary_loss_mlp": 0.011034, + "balance_loss_clip": 1.00166678, + "balance_loss_mlp": 1.00049889, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 3.2248669547026108, + "language_loss": 0.78257334, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80494249, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 5.5068535804748535 + }, + { + "auxiliary_loss_clip": 0.01150235, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00053668, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.5100759162391626, + "language_loss": 0.76456594, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78708839, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.563941240310669 + }, + { + "auxiliary_loss_clip": 0.01116274, + "auxiliary_loss_mlp": 0.01105182, + "balance_loss_clip": 1.00180376, + "balance_loss_mlp": 1.00046802, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.27725161765785, + "language_loss": 0.70541453, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72762913, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 2.620115041732788 + }, + { + "auxiliary_loss_clip": 0.01148684, + "auxiliary_loss_mlp": 0.01104276, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00061214, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.7772064733635984, + "language_loss": 0.70905471, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73158431, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 2.5557281970977783 + }, + { + "auxiliary_loss_clip": 0.01165002, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00062454, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.618008403131181, + "language_loss": 0.77344739, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79612792, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 2.507751226425171 + }, + { + "auxiliary_loss_clip": 0.01117041, + "auxiliary_loss_mlp": 0.01103553, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00046122, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.9658946531366415, + "language_loss": 0.75473768, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77694356, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.616187572479248 + }, + { + "auxiliary_loss_clip": 0.01131744, + "auxiliary_loss_mlp": 0.01103441, + "balance_loss_clip": 1.00188148, + "balance_loss_mlp": 1.00044394, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 4.585787015186868, + "language_loss": 0.84126288, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86361468, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 2.574505090713501 + }, + { + "auxiliary_loss_clip": 0.01148439, + "auxiliary_loss_mlp": 0.01104199, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00053418, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.5608377642823907, + "language_loss": 0.75946397, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78199029, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.549865484237671 + }, + { + "auxiliary_loss_clip": 0.01165256, + "auxiliary_loss_mlp": 0.01103772, + "balance_loss_clip": 1.00196159, + "balance_loss_mlp": 1.000489, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.6209800281611826, + "language_loss": 0.7759732, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.7986635, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.536107301712036 + }, + { + "auxiliary_loss_clip": 0.01150067, + "auxiliary_loss_mlp": 0.01103827, + "balance_loss_clip": 1.00174165, + "balance_loss_mlp": 1.00054431, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.9064891647881614, + "language_loss": 0.81925547, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84179437, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.562366247177124 + }, + { + "auxiliary_loss_clip": 0.01117376, + "auxiliary_loss_mlp": 0.01104407, + "balance_loss_clip": 1.00154495, + "balance_loss_mlp": 1.00045657, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.9087834746391332, + "language_loss": 0.59281766, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61503547, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 2.5926876068115234 + }, + { + "auxiliary_loss_clip": 0.01133585, + "auxiliary_loss_mlp": 0.01104497, + "balance_loss_clip": 1.00187325, + "balance_loss_mlp": 1.00054622, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.5654842628447476, + "language_loss": 0.72967982, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75206065, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 2.6356611251831055 + }, + { + "auxiliary_loss_clip": 0.01086905, + "auxiliary_loss_mlp": 0.01104269, + "balance_loss_clip": 1.00169802, + "balance_loss_mlp": 1.0005089, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 3.4588478429457696, + "language_loss": 0.71930307, + "learning_rate": 4.128093876144161e-07, + "loss": 0.74121481, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 2.665912389755249 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01104239, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00057423, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.6740741372450203, + "language_loss": 0.75875604, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.78114069, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 2.59540057182312 + }, + { + "auxiliary_loss_clip": 0.01100371, + "auxiliary_loss_mlp": 0.01103041, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00052071, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.2650774430289378, + "language_loss": 0.77484429, + "learning_rate": 4.12335575223518e-07, + "loss": 0.7968784, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.7244458198547363 + }, + { + "auxiliary_loss_clip": 0.01148542, + "auxiliary_loss_mlp": 0.01104392, + "balance_loss_clip": 1.00173056, + "balance_loss_mlp": 1.00063229, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.880374924141419, + "language_loss": 0.64117801, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66370738, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.6348812580108643 + }, + { + "auxiliary_loss_clip": 0.01117321, + "auxiliary_loss_mlp": 0.0110305, + "balance_loss_clip": 1.00170147, + "balance_loss_mlp": 1.00053024, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.5377445966461478, + "language_loss": 0.60959554, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63179928, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.6546435356140137 + }, + { + "auxiliary_loss_clip": 0.01134045, + "auxiliary_loss_mlp": 0.01104146, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00057721, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.7193080972326618, + "language_loss": 0.79292083, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81530273, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.6077654361724854 + }, + { + "auxiliary_loss_clip": 0.01131873, + "auxiliary_loss_mlp": 0.0110356, + "balance_loss_clip": 1.00165832, + "balance_loss_mlp": 1.00056386, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.7707053736258072, + "language_loss": 0.63263983, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65499413, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 2.5953280925750732 + }, + { + "auxiliary_loss_clip": 0.01150335, + "auxiliary_loss_mlp": 0.01101758, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00038254, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.4712225319188938, + "language_loss": 0.71060634, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73312724, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 2.6302506923675537 + }, + { + "auxiliary_loss_clip": 0.01118843, + "auxiliary_loss_mlp": 0.0110434, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.0004847, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.7582743867058204, + "language_loss": 0.6266464, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64887828, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.713844060897827 + }, + { + "auxiliary_loss_clip": 0.01150526, + "auxiliary_loss_mlp": 0.01105146, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.00062382, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.9337046927580823, + "language_loss": 0.80174083, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82429755, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 2.6019270420074463 + }, + { + "auxiliary_loss_clip": 0.01118609, + "auxiliary_loss_mlp": 0.00747403, + "balance_loss_clip": 1.00174582, + "balance_loss_mlp": 1.00036764, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.9718672523731775, + "language_loss": 0.71717906, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73583913, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 2.618579149246216 + }, + { + "auxiliary_loss_clip": 0.0114829, + "auxiliary_loss_mlp": 0.0110421, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00054538, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 2.177395687826834, + "language_loss": 0.7358799, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75840491, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.5266153812408447 + }, + { + "auxiliary_loss_clip": 0.01136127, + "auxiliary_loss_mlp": 0.01103469, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00056756, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.4769118132478447, + "language_loss": 0.7016924, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72408831, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 2.594514846801758 + }, + { + "auxiliary_loss_clip": 0.01134034, + "auxiliary_loss_mlp": 0.01102916, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00039637, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.6357911539383891, + "language_loss": 0.7337693, + "learning_rate": 4.097339136128437e-07, + "loss": 0.7561388, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 3.961425542831421 + }, + { + "auxiliary_loss_clip": 0.01131645, + "auxiliary_loss_mlp": 0.01103697, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00041389, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 1.6168760037326244, + "language_loss": 0.75030255, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77265596, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 2.5946946144104004 + }, + { + "auxiliary_loss_clip": 0.01133602, + "auxiliary_loss_mlp": 0.01103792, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.0004133, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.59865538124761, + "language_loss": 0.61541152, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63778543, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 2.682398557662964 + }, + { + "auxiliary_loss_clip": 0.01148522, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00046873, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 1.925318135002753, + "language_loss": 0.70323122, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72575206, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 2.6228129863739014 + }, + { + "auxiliary_loss_clip": 0.01115186, + "auxiliary_loss_mlp": 0.01102707, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.00056875, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 1.9541916849548284, + "language_loss": 0.62212473, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64430368, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 2.5930869579315186 + }, + { + "auxiliary_loss_clip": 0.0115003, + "auxiliary_loss_mlp": 0.01104129, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00046468, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 1.8946490639465459, + "language_loss": 0.71353263, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73607421, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.5831356048583984 + }, + { + "auxiliary_loss_clip": 0.01115544, + "auxiliary_loss_mlp": 0.01103156, + "balance_loss_clip": 1.00154245, + "balance_loss_mlp": 1.0004456, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.5772058689474413, + "language_loss": 0.63321751, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65540457, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.6394002437591553 + }, + { + "auxiliary_loss_clip": 0.01148472, + "auxiliary_loss_mlp": 0.01103114, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00049901, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.6527190291012233, + "language_loss": 0.56035662, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58287245, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.677680015563965 + }, + { + "auxiliary_loss_clip": 0.01132189, + "auxiliary_loss_mlp": 0.01104423, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00066364, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.794863762596977, + "language_loss": 0.72076386, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.74312997, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.8091824054718018 + }, + { + "auxiliary_loss_clip": 0.0111924, + "auxiliary_loss_mlp": 0.01104669, + "balance_loss_clip": 1.00191343, + "balance_loss_mlp": 1.0005281, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.7031813709049772, + "language_loss": 0.72313857, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74537772, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.622056484222412 + }, + { + "auxiliary_loss_clip": 0.01121015, + "auxiliary_loss_mlp": 0.01103494, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00059247, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.9453995575345238, + "language_loss": 0.76421511, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78646016, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.596256971359253 + }, + { + "auxiliary_loss_clip": 0.01112545, + "auxiliary_loss_mlp": 0.01080087, + "balance_loss_clip": 1.00124359, + "balance_loss_mlp": 0.99997884, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6958195398274885, + "language_loss": 0.60833681, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.63026321, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 3.2597408294677734 + }, + { + "auxiliary_loss_clip": 0.01131745, + "auxiliary_loss_mlp": 0.01103741, + "balance_loss_clip": 1.00174296, + "balance_loss_mlp": 1.000458, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.1531404024931105, + "language_loss": 0.70718312, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72953802, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 3.9744107723236084 + }, + { + "auxiliary_loss_clip": 0.01118902, + "auxiliary_loss_mlp": 0.01104146, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00057662, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 2.2861078818545213, + "language_loss": 0.75833696, + "learning_rate": 4.066686308212037e-07, + "loss": 0.78056741, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 5.493755578994751 + }, + { + "auxiliary_loss_clip": 0.01133619, + "auxiliary_loss_mlp": 0.01103321, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00060987, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.641263690934516, + "language_loss": 0.77621174, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79858112, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 2.6256449222564697 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01105211, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.0005933, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 2.034574148328808, + "language_loss": 0.6356231, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65788114, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 2.657759666442871 + }, + { + "auxiliary_loss_clip": 0.01150416, + "auxiliary_loss_mlp": 0.01103781, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00049758, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.5225519689477038, + "language_loss": 0.72004867, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74259061, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.600303888320923 + }, + { + "auxiliary_loss_clip": 0.01165225, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.0004046, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 2.4978487514245726, + "language_loss": 0.83485425, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85398149, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 2.541811227798462 + }, + { + "auxiliary_loss_clip": 0.01165101, + "auxiliary_loss_mlp": 0.0110326, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00054932, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.6433743580633229, + "language_loss": 0.58717161, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60985518, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 2.566880226135254 + }, + { + "auxiliary_loss_clip": 0.01165163, + "auxiliary_loss_mlp": 0.01103635, + "balance_loss_clip": 1.00172997, + "balance_loss_mlp": 1.00044727, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.64702940649654, + "language_loss": 0.69172096, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71440893, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 2.5317604541778564 + }, + { + "auxiliary_loss_clip": 0.01118815, + "auxiliary_loss_mlp": 0.0110336, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00055408, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5877506377988277, + "language_loss": 0.69078171, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71300352, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.606912136077881 + }, + { + "auxiliary_loss_clip": 0.01148595, + "auxiliary_loss_mlp": 0.01103628, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00063109, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.3614874041125664, + "language_loss": 0.69639182, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71891409, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.6263155937194824 + }, + { + "auxiliary_loss_clip": 0.01135644, + "auxiliary_loss_mlp": 0.01103819, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00044024, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 1.867498594037379, + "language_loss": 0.76558828, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.78798288, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.565469264984131 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01105399, + "balance_loss_clip": 1.00174487, + "balance_loss_mlp": 1.00039935, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.700284342156991, + "language_loss": 0.78569424, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80775607, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.7424561977386475 + }, + { + "auxiliary_loss_clip": 0.01160783, + "auxiliary_loss_mlp": 0.01080503, + "balance_loss_clip": 1.00128257, + "balance_loss_mlp": 1.00001323, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.8694738365846839, + "language_loss": 0.64736688, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66977978, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 3.019517660140991 + }, + { + "auxiliary_loss_clip": 0.01150081, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00065804, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 1.7712049553857216, + "language_loss": 0.82602179, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.84855914, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.5810933113098145 + }, + { + "auxiliary_loss_clip": 0.01150412, + "auxiliary_loss_mlp": 0.01103715, + "balance_loss_clip": 1.00187647, + "balance_loss_mlp": 1.00052714, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.5779875590048382, + "language_loss": 0.66397452, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68651587, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 2.5367724895477295 + }, + { + "auxiliary_loss_clip": 0.01165271, + "auxiliary_loss_mlp": 0.01103987, + "balance_loss_clip": 1.00191152, + "balance_loss_mlp": 1.00060904, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.6066562243280125, + "language_loss": 0.75148237, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77417493, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.5291695594787598 + }, + { + "auxiliary_loss_clip": 0.0115056, + "auxiliary_loss_mlp": 0.01103721, + "balance_loss_clip": 1.00182474, + "balance_loss_mlp": 1.00062919, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.2819201518073258, + "language_loss": 0.75600922, + "learning_rate": 4.031444553532575e-07, + "loss": 0.77855194, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.602116346359253 + }, + { + "auxiliary_loss_clip": 0.01095187, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_clip": 1.00082397, + "balance_loss_mlp": 1.00005615, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8394594136390673, + "language_loss": 0.53833884, + "learning_rate": 4.029099944131522e-07, + "loss": 0.5600962, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.111407995223999 + }, + { + "auxiliary_loss_clip": 0.01131595, + "auxiliary_loss_mlp": 0.01102655, + "balance_loss_clip": 1.00169241, + "balance_loss_mlp": 1.00051618, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.8005681411480379, + "language_loss": 0.71233827, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73468077, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.6892151832580566 + }, + { + "auxiliary_loss_clip": 0.01131993, + "auxiliary_loss_mlp": 0.01104452, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00050163, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 1.8887730034762782, + "language_loss": 0.6485281, + "learning_rate": 4.024412542272706e-07, + "loss": 0.67089248, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.6731245517730713 + }, + { + "auxiliary_loss_clip": 0.01160783, + "auxiliary_loss_mlp": 0.01080489, + "balance_loss_clip": 1.00128269, + "balance_loss_mlp": 0.99999887, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7618183505718156, + "language_loss": 0.59005427, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61246693, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 3.1298274993896484 + }, + { + "auxiliary_loss_clip": 0.01116876, + "auxiliary_loss_mlp": 0.01103355, + "balance_loss_clip": 1.00169683, + "balance_loss_mlp": 1.00045311, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.5076826468249143, + "language_loss": 0.66389716, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68609941, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 2.6193087100982666 + }, + { + "auxiliary_loss_clip": 0.01165249, + "auxiliary_loss_mlp": 0.00747433, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00047863, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.949892956763446, + "language_loss": 0.74259806, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76172489, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.5266551971435547 + }, + { + "auxiliary_loss_clip": 0.01148426, + "auxiliary_loss_mlp": 0.01104124, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00036454, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 2.061879416701268, + "language_loss": 0.80555975, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82808524, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 2.597303867340088 + }, + { + "auxiliary_loss_clip": 0.01085244, + "auxiliary_loss_mlp": 0.01102479, + "balance_loss_clip": 1.0017724, + "balance_loss_mlp": 1.00053108, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 2.0514284747769045, + "language_loss": 0.65837032, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68024755, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 4.105406284332275 + }, + { + "auxiliary_loss_clip": 0.01150467, + "auxiliary_loss_mlp": 0.011036, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00050735, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.6916142267240961, + "language_loss": 0.77924973, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80179036, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 2.5315113067626953 + }, + { + "auxiliary_loss_clip": 0.01165254, + "auxiliary_loss_mlp": 0.01104716, + "balance_loss_clip": 1.00189054, + "balance_loss_mlp": 1.00047922, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.8794361376630464, + "language_loss": 0.70912325, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73182297, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.5341885089874268 + }, + { + "auxiliary_loss_clip": 0.01114946, + "auxiliary_loss_mlp": 0.01104683, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00044632, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.7368073018154724, + "language_loss": 0.76514113, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78733742, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 2.6483256816864014 + }, + { + "auxiliary_loss_clip": 0.01146036, + "auxiliary_loss_mlp": 0.01102369, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.0006125, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.544984603943013, + "language_loss": 0.79924405, + "learning_rate": 4.003349231059898e-07, + "loss": 0.82172811, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.593040943145752 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01103225, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00051379, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 2.060608001399296, + "language_loss": 0.66439724, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68692881, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 2.582501173019409 + }, + { + "auxiliary_loss_clip": 0.01148319, + "auxiliary_loss_mlp": 0.01103269, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.0005579, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.6107074327202007, + "language_loss": 0.73435682, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75687265, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 2.5564751625061035 + }, + { + "auxiliary_loss_clip": 0.01098196, + "auxiliary_loss_mlp": 0.01104698, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00046098, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 1.7607830585703417, + "language_loss": 0.73744404, + "learning_rate": 3.996339042831798e-07, + "loss": 0.75947297, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 2.6794590950012207 + }, + { + "auxiliary_loss_clip": 0.01145936, + "auxiliary_loss_mlp": 0.01080473, + "balance_loss_clip": 1.0012573, + "balance_loss_mlp": 0.99998254, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.6967712781332902, + "language_loss": 0.52991003, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.55217409, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 3.2375688552856445 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01104757, + "balance_loss_clip": 1.00164616, + "balance_loss_mlp": 1.00061607, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.6084830267336765, + "language_loss": 0.72474229, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74710971, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 2.5920608043670654 + }, + { + "auxiliary_loss_clip": 0.01148701, + "auxiliary_loss_mlp": 0.0110361, + "balance_loss_clip": 1.00170505, + "balance_loss_mlp": 1.0004226, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.8617071656427495, + "language_loss": 0.77398664, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79650974, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.571228265762329 + }, + { + "auxiliary_loss_clip": 0.01165273, + "auxiliary_loss_mlp": 0.01104135, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00056553, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 1.7231055210522916, + "language_loss": 0.83275032, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85544443, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.5055301189422607 + }, + { + "auxiliary_loss_clip": 0.01132925, + "auxiliary_loss_mlp": 0.01104331, + "balance_loss_clip": 1.00170755, + "balance_loss_mlp": 1.0003804, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.817197372374289, + "language_loss": 0.73215747, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75453007, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 2.577256441116333 + }, + { + "auxiliary_loss_clip": 0.01118169, + "auxiliary_loss_mlp": 0.01104166, + "balance_loss_clip": 1.001773, + "balance_loss_mlp": 1.00050116, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 9.629548324216362, + "language_loss": 0.74508625, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76730967, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.6049177646636963 + }, + { + "auxiliary_loss_clip": 0.01117258, + "auxiliary_loss_mlp": 0.01103844, + "balance_loss_clip": 1.00164044, + "balance_loss_mlp": 1.00046623, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.9376500555733036, + "language_loss": 0.75158358, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77379465, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 5.457827568054199 + }, + { + "auxiliary_loss_clip": 0.01115447, + "auxiliary_loss_mlp": 0.01105495, + "balance_loss_clip": 1.00160813, + "balance_loss_mlp": 1.00049543, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 1.891621895501568, + "language_loss": 0.7474094, + "learning_rate": 3.977671915907068e-07, + "loss": 0.76961881, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 4.028342008590698 + }, + { + "auxiliary_loss_clip": 0.01086405, + "auxiliary_loss_mlp": 0.00747449, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.00036871, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.669440104335903, + "language_loss": 0.80184978, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82018828, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.7959988117218018 + }, + { + "auxiliary_loss_clip": 0.01117103, + "auxiliary_loss_mlp": 0.01103353, + "balance_loss_clip": 1.00170469, + "balance_loss_mlp": 1.0004518, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 2.3582352923400833, + "language_loss": 0.74363017, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76583481, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 2.6383419036865234 + }, + { + "auxiliary_loss_clip": 0.01148244, + "auxiliary_loss_mlp": 0.01102743, + "balance_loss_clip": 1.0017128, + "balance_loss_mlp": 1.00041354, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.520470682550138, + "language_loss": 0.78903651, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81154633, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 2.5777335166931152 + }, + { + "auxiliary_loss_clip": 0.01118035, + "auxiliary_loss_mlp": 0.01103484, + "balance_loss_clip": 1.00166988, + "balance_loss_mlp": 1.00048733, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.6039371355029903, + "language_loss": 0.6801995, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70241475, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 2.7101922035217285 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01080111, + "balance_loss_clip": 1.00121927, + "balance_loss_mlp": 1.0000025, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8083696212433485, + "language_loss": 0.61592865, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63804626, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 3.1146249771118164 + }, + { + "auxiliary_loss_clip": 0.01134542, + "auxiliary_loss_mlp": 0.01105065, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00073338, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 2.606049411424785, + "language_loss": 0.63910067, + "learning_rate": 3.963697086102522e-07, + "loss": 0.6614967, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.6244280338287354 + }, + { + "auxiliary_loss_clip": 0.01131539, + "auxiliary_loss_mlp": 0.01103027, + "balance_loss_clip": 1.00172246, + "balance_loss_mlp": 1.00041127, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.8267841089077084, + "language_loss": 0.69052225, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71286792, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 2.5923972129821777 + }, + { + "auxiliary_loss_clip": 0.01148619, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.00054121, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.7288963195764426, + "language_loss": 0.70023656, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72275722, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.6283316612243652 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01080542, + "balance_loss_clip": 1.00128317, + "balance_loss_mlp": 1.0000515, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8812985217256878, + "language_loss": 0.62912989, + "learning_rate": 3.956717879334059e-07, + "loss": 0.65123367, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.207491874694824 + }, + { + "auxiliary_loss_clip": 0.01133527, + "auxiliary_loss_mlp": 0.01103682, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.00059009, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.406196099255114, + "language_loss": 0.72305673, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74542886, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.6469321250915527 + }, + { + "auxiliary_loss_clip": 0.01148485, + "auxiliary_loss_mlp": 0.01103674, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00039101, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 2.839801598349389, + "language_loss": 0.72702014, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74954176, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.534806966781616 + }, + { + "auxiliary_loss_clip": 0.01129791, + "auxiliary_loss_mlp": 0.01103333, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00043106, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 2.0547041195730036, + "language_loss": 0.75778091, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.78011215, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 2.6255743503570557 + }, + { + "auxiliary_loss_clip": 0.01165322, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_clip": 1.00204813, + "balance_loss_mlp": 1.00066638, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.114837685860757, + "language_loss": 0.83292687, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85561955, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 2.531576156616211 + }, + { + "auxiliary_loss_clip": 0.01148466, + "auxiliary_loss_mlp": 0.01102566, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00061846, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.8664731545287918, + "language_loss": 0.71182019, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73433053, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 2.554868698120117 + }, + { + "auxiliary_loss_clip": 0.01119519, + "auxiliary_loss_mlp": 0.01103244, + "balance_loss_clip": 1.00193918, + "balance_loss_mlp": 1.00043833, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.568034606714909, + "language_loss": 0.61868644, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.64091402, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.6839678287506104 + }, + { + "auxiliary_loss_clip": 0.01148109, + "auxiliary_loss_mlp": 0.01104157, + "balance_loss_clip": 1.00193226, + "balance_loss_mlp": 1.00058794, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.7536200709091305, + "language_loss": 0.76543951, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78796214, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.5738155841827393 + }, + { + "auxiliary_loss_clip": 0.01084288, + "auxiliary_loss_mlp": 0.01106191, + "balance_loss_clip": 1.00175834, + "balance_loss_mlp": 1.00052381, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.0727688096747108, + "language_loss": 0.73241115, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75431597, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.7196686267852783 + }, + { + "auxiliary_loss_clip": 0.01117141, + "auxiliary_loss_mlp": 0.01104204, + "balance_loss_clip": 1.00154352, + "balance_loss_mlp": 1.00053918, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 1.827717650246876, + "language_loss": 0.66030425, + "learning_rate": 3.935813120140714e-07, + "loss": 0.68251771, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 2.604536533355713 + }, + { + "auxiliary_loss_clip": 0.01119142, + "auxiliary_loss_mlp": 0.01104927, + "balance_loss_clip": 1.00174236, + "balance_loss_mlp": 1.000404, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.076873461104564, + "language_loss": 0.68450046, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70674121, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 2.8824963569641113 + }, + { + "auxiliary_loss_clip": 0.0110203, + "auxiliary_loss_mlp": 0.01102641, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00050259, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.4777364385826117, + "language_loss": 0.77403557, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79608226, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 2.694852590560913 + }, + { + "auxiliary_loss_clip": 0.01117301, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_clip": 1.00158501, + "balance_loss_mlp": 1.00044334, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.3873552901051458, + "language_loss": 0.77048957, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.792696, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 2.680537223815918 + }, + { + "auxiliary_loss_clip": 0.01150352, + "auxiliary_loss_mlp": 0.01103541, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00044918, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.47404591411117, + "language_loss": 0.84483725, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86737621, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 2.578542470932007 + }, + { + "auxiliary_loss_clip": 0.0113188, + "auxiliary_loss_mlp": 0.01102746, + "balance_loss_clip": 1.00168872, + "balance_loss_mlp": 1.00051248, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 2.1216493965082446, + "language_loss": 0.7339083, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75625455, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 4.093255281448364 + }, + { + "auxiliary_loss_clip": 0.0116523, + "auxiliary_loss_mlp": 0.01103512, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00061083, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 1.7207239958043332, + "language_loss": 0.69437623, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71706367, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 2.5472447872161865 + }, + { + "auxiliary_loss_clip": 0.01165251, + "auxiliary_loss_mlp": 0.01104753, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00061166, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.5629849204645527, + "language_loss": 0.70357603, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72627604, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.564850091934204 + }, + { + "auxiliary_loss_clip": 0.01165452, + "auxiliary_loss_mlp": 0.01106041, + "balance_loss_clip": 1.00191963, + "balance_loss_mlp": 1.00056469, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.355618091177395, + "language_loss": 0.78741294, + "learning_rate": 3.91727253254452e-07, + "loss": 0.81012785, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 2.5228216648101807 + }, + { + "auxiliary_loss_clip": 0.01147958, + "auxiliary_loss_mlp": 0.01103841, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00046265, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.288060816519856, + "language_loss": 0.74794608, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77046406, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 2.606541395187378 + }, + { + "auxiliary_loss_clip": 0.01148759, + "auxiliary_loss_mlp": 0.01103695, + "balance_loss_clip": 1.0017612, + "balance_loss_mlp": 1.00041211, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 1.985639126816291, + "language_loss": 0.60415852, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62668312, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 2.618964195251465 + }, + { + "auxiliary_loss_clip": 0.01134351, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.705200374690562, + "language_loss": 0.66260457, + "learning_rate": 3.910329872447706e-07, + "loss": 0.6849879, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 2.605485439300537 + }, + { + "auxiliary_loss_clip": 0.01165154, + "auxiliary_loss_mlp": 0.0110437, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00051475, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.0208460003408675, + "language_loss": 0.75304973, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77574492, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 2.513315200805664 + }, + { + "auxiliary_loss_clip": 0.0116513, + "auxiliary_loss_mlp": 0.01102703, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00037408, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.5105707716100198, + "language_loss": 0.74383295, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76651132, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.5525221824645996 + }, + { + "auxiliary_loss_clip": 0.01165283, + "auxiliary_loss_mlp": 0.01104098, + "balance_loss_clip": 1.00185955, + "balance_loss_mlp": 1.00052941, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 2.069454621468926, + "language_loss": 0.70074332, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72343719, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 2.491593837738037 + }, + { + "auxiliary_loss_clip": 0.01135641, + "auxiliary_loss_mlp": 0.01103095, + "balance_loss_clip": 1.00169611, + "balance_loss_mlp": 1.00048018, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.6126072472718005, + "language_loss": 0.74089289, + "learning_rate": 3.901081534434312e-07, + "loss": 0.76328027, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.604937791824341 + }, + { + "auxiliary_loss_clip": 0.01133762, + "auxiliary_loss_mlp": 0.01105069, + "balance_loss_clip": 1.00171983, + "balance_loss_mlp": 1.00064158, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.4720724997587187, + "language_loss": 0.87182128, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89420962, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 2.545271635055542 + }, + { + "auxiliary_loss_clip": 0.01148534, + "auxiliary_loss_mlp": 0.0110404, + "balance_loss_clip": 1.00166631, + "balance_loss_mlp": 1.00056648, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 2.208163124659135, + "language_loss": 0.74466753, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76719326, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.5534675121307373 + }, + { + "auxiliary_loss_clip": 0.01101428, + "auxiliary_loss_mlp": 0.01103437, + "balance_loss_clip": 1.00151885, + "balance_loss_mlp": 1.00043976, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.0989233719865648, + "language_loss": 0.79017949, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81222808, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 2.6948840618133545 + }, + { + "auxiliary_loss_clip": 0.01135746, + "auxiliary_loss_mlp": 0.01102923, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00049853, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.5371035414719192, + "language_loss": 0.74584281, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76822954, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 5.522322416305542 + }, + { + "auxiliary_loss_clip": 0.01101635, + "auxiliary_loss_mlp": 0.01104985, + "balance_loss_clip": 1.00155783, + "balance_loss_mlp": 1.00055766, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 27.57124694912226, + "language_loss": 0.6866194, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70868558, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 4.117604970932007 + }, + { + "auxiliary_loss_clip": 0.01113771, + "auxiliary_loss_mlp": 0.01080457, + "balance_loss_clip": 1.00112844, + "balance_loss_mlp": 0.9999668, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.7223838308352349, + "language_loss": 0.55663699, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57857931, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 3.2767555713653564 + }, + { + "auxiliary_loss_clip": 0.0109969, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_clip": 1.00187206, + "balance_loss_mlp": 1.00050282, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 4.002634063803578, + "language_loss": 0.72652948, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.7485652, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.6766226291656494 + }, + { + "auxiliary_loss_clip": 0.01148375, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_clip": 1.00172925, + "balance_loss_mlp": 1.0003525, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.7497291587277668, + "language_loss": 0.70224047, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72476435, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.5767290592193604 + }, + { + "auxiliary_loss_clip": 0.0115056, + "auxiliary_loss_mlp": 0.01103526, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00052929, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.4167047242582542, + "language_loss": 0.69408607, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71662694, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 2.6779463291168213 + }, + { + "auxiliary_loss_clip": 0.01165406, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00045848, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.719753803946013, + "language_loss": 0.76355815, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78625727, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.546335220336914 + }, + { + "auxiliary_loss_clip": 0.01102195, + "auxiliary_loss_mlp": 0.01103132, + "balance_loss_clip": 1.00165915, + "balance_loss_mlp": 1.00032568, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 1.6588891472701777, + "language_loss": 0.6911962, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71324944, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.674376964569092 + }, + { + "auxiliary_loss_clip": 0.01150098, + "auxiliary_loss_mlp": 0.01104007, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.0005331, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 2.1931665792452106, + "language_loss": 0.64019859, + "learning_rate": 3.873395148176135e-07, + "loss": 0.66273969, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 2.5641162395477295 + }, + { + "auxiliary_loss_clip": 0.01131728, + "auxiliary_loss_mlp": 0.01103575, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.00057769, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 3.422587074804842, + "language_loss": 0.76261377, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78496683, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.657729148864746 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01104379, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00061882, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.901919700825513, + "language_loss": 0.6997605, + "learning_rate": 3.868789307701381e-07, + "loss": 0.72230566, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.6121370792388916 + }, + { + "auxiliary_loss_clip": 0.01148099, + "auxiliary_loss_mlp": 0.01104409, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.00045884, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.2678079752455194, + "language_loss": 0.79029191, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81281704, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 2.5319371223449707 + }, + { + "auxiliary_loss_clip": 0.01165283, + "auxiliary_loss_mlp": 0.01105005, + "balance_loss_clip": 1.00188887, + "balance_loss_mlp": 1.00067258, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.629666388663179, + "language_loss": 0.72093141, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74363428, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.538360834121704 + }, + { + "auxiliary_loss_clip": 0.01123124, + "auxiliary_loss_mlp": 0.01080101, + "balance_loss_clip": 1.00130427, + "balance_loss_mlp": 0.99999285, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6673864487471413, + "language_loss": 0.51242775, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53446001, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.2139132022857666 + }, + { + "auxiliary_loss_clip": 0.01165141, + "auxiliary_loss_mlp": 0.01104232, + "balance_loss_clip": 1.00174427, + "balance_loss_mlp": 1.00047207, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.693780892177935, + "language_loss": 0.7387464, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76144016, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.549448013305664 + }, + { + "auxiliary_loss_clip": 0.0111733, + "auxiliary_loss_mlp": 0.01103531, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00043905, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.335943094330199, + "language_loss": 0.71463799, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73684657, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.709874391555786 + }, + { + "auxiliary_loss_clip": 0.01131909, + "auxiliary_loss_mlp": 0.01103891, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00051308, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.0239705621591377, + "language_loss": 0.83476245, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.85712045, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 2.559145927429199 + }, + { + "auxiliary_loss_clip": 0.0114654, + "auxiliary_loss_mlp": 0.01080068, + "balance_loss_clip": 1.00135636, + "balance_loss_mlp": 0.99995977, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.8515707913809334, + "language_loss": 0.55555964, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57782578, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 3.1273019313812256 + }, + { + "auxiliary_loss_clip": 0.01148233, + "auxiliary_loss_mlp": 0.01102888, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00046384, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.6525657750731566, + "language_loss": 0.84504628, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86755753, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.5177242755889893 + }, + { + "auxiliary_loss_clip": 0.01116621, + "auxiliary_loss_mlp": 0.01103197, + "balance_loss_clip": 1.00167513, + "balance_loss_mlp": 1.00058174, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.5089591189422897, + "language_loss": 0.70574617, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72794437, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.701042413711548 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00044036, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 1.9091980582528414, + "language_loss": 0.76136065, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78387618, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.5877580642700195 + }, + { + "auxiliary_loss_clip": 0.01165139, + "auxiliary_loss_mlp": 0.01103777, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00068474, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.5634245028946099, + "language_loss": 0.6511839, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67387301, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.553232192993164 + }, + { + "auxiliary_loss_clip": 0.01146189, + "auxiliary_loss_mlp": 0.01080454, + "balance_loss_clip": 1.00125468, + "balance_loss_mlp": 0.99996406, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7396625911865976, + "language_loss": 0.57369506, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59596151, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 3.2545082569122314 + }, + { + "auxiliary_loss_clip": 0.01148426, + "auxiliary_loss_mlp": 0.01103379, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00047779, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.9570881131920848, + "language_loss": 0.77226955, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79478759, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 2.5577311515808105 + }, + { + "auxiliary_loss_clip": 0.01148787, + "auxiliary_loss_mlp": 0.01103714, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00043058, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.7842128696968946, + "language_loss": 0.70591211, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72843707, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 3.9321041107177734 + }, + { + "auxiliary_loss_clip": 0.01133282, + "auxiliary_loss_mlp": 0.01103883, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00040936, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.1530778331085796, + "language_loss": 0.68711209, + "learning_rate": 3.834323543710805e-07, + "loss": 0.70948374, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.5931992530822754 + }, + { + "auxiliary_loss_clip": 0.0116525, + "auxiliary_loss_mlp": 0.01103825, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00073254, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.3034271149821546, + "language_loss": 0.72013128, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74282205, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 2.5030057430267334 + }, + { + "auxiliary_loss_clip": 0.01150306, + "auxiliary_loss_mlp": 0.0110336, + "balance_loss_clip": 1.00182533, + "balance_loss_mlp": 1.00036359, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.749788595293938, + "language_loss": 0.64483738, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66737401, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.606147527694702 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01104129, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00046444, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.008321445984119, + "language_loss": 0.84108794, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.86363494, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 2.556532859802246 + }, + { + "auxiliary_loss_clip": 0.01100976, + "auxiliary_loss_mlp": 0.01104477, + "balance_loss_clip": 1.00158167, + "balance_loss_mlp": 1.00052667, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.9430922600623015, + "language_loss": 0.67942762, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70148218, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 2.6713225841522217 + }, + { + "auxiliary_loss_clip": 0.01118661, + "auxiliary_loss_mlp": 0.00747073, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00040269, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.5246872394098523, + "language_loss": 0.85019791, + "learning_rate": 3.822865591408084e-07, + "loss": 0.86885524, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.72924542427063 + }, + { + "auxiliary_loss_clip": 0.01113439, + "auxiliary_loss_mlp": 0.011026, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00046158, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.5027131663421387, + "language_loss": 0.7033757, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72553605, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 2.7202723026275635 + }, + { + "auxiliary_loss_clip": 0.01150529, + "auxiliary_loss_mlp": 0.01103044, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00042915, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 3.163530492638621, + "language_loss": 0.75053322, + "learning_rate": 3.818286703948788e-07, + "loss": 0.7730689, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 2.597454309463501 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.01103946, + "balance_loss_clip": 1.00178993, + "balance_loss_mlp": 1.00047255, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.467636243822835, + "language_loss": 0.76163083, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78415388, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 2.635434627532959 + }, + { + "auxiliary_loss_clip": 0.01133814, + "auxiliary_loss_mlp": 0.00747348, + "balance_loss_clip": 1.00172532, + "balance_loss_mlp": 1.00036645, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.881232272634533, + "language_loss": 0.7379483, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75675988, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.599832773208618 + }, + { + "auxiliary_loss_clip": 0.01133331, + "auxiliary_loss_mlp": 0.01103783, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00059605, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.7393119964364705, + "language_loss": 0.70605004, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72842115, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 2.6172144412994385 + }, + { + "auxiliary_loss_clip": 0.01165164, + "auxiliary_loss_mlp": 0.01103402, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00040555, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.067949115518391, + "language_loss": 0.76603925, + "learning_rate": 3.809136293070545e-07, + "loss": 0.78872496, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.4996132850646973 + }, + { + "auxiliary_loss_clip": 0.01148531, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00059474, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 5.270853766111362, + "language_loss": 0.68726683, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70978332, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 4.0277276039123535 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01103853, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00056994, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.702647652209332, + "language_loss": 0.68087029, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70325136, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 5.405944585800171 + }, + { + "auxiliary_loss_clip": 0.01148568, + "auxiliary_loss_mlp": 0.01104369, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00051343, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.5677977365425704, + "language_loss": 0.81344891, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83597833, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 2.56697678565979 + }, + { + "auxiliary_loss_clip": 0.01150297, + "auxiliary_loss_mlp": 0.01104228, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00056386, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.8603649208997144, + "language_loss": 0.84532619, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.86787146, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 2.5485174655914307 + }, + { + "auxiliary_loss_clip": 0.01134845, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00056529, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.7127472243575352, + "language_loss": 0.66935933, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69174623, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 2.5641825199127197 + }, + { + "auxiliary_loss_clip": 0.01116851, + "auxiliary_loss_mlp": 0.0110221, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.00045276, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.6731493485072062, + "language_loss": 0.76268697, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78487754, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.6030685901641846 + }, + { + "auxiliary_loss_clip": 0.0115076, + "auxiliary_loss_mlp": 0.01104806, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00075984, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.470085571195692, + "language_loss": 0.65364236, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67619801, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 2.571678400039673 + }, + { + "auxiliary_loss_clip": 0.01118776, + "auxiliary_loss_mlp": 0.01104051, + "balance_loss_clip": 1.00174069, + "balance_loss_mlp": 1.00067258, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 2.5551973186485064, + "language_loss": 0.80816305, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.83039129, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.662121057510376 + }, + { + "auxiliary_loss_clip": 0.01132887, + "auxiliary_loss_mlp": 0.011044, + "balance_loss_clip": 1.00182712, + "balance_loss_mlp": 1.00044966, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.589496703536973, + "language_loss": 0.85024738, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87262028, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.5744571685791016 + }, + { + "auxiliary_loss_clip": 0.01115359, + "auxiliary_loss_mlp": 0.00747501, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00040245, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.7157652169079758, + "language_loss": 0.75744474, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77607334, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.6911613941192627 + }, + { + "auxiliary_loss_clip": 0.01150658, + "auxiliary_loss_mlp": 0.00747464, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00044322, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.7107634097803208, + "language_loss": 0.78478873, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80376995, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 2.5728678703308105 + }, + { + "auxiliary_loss_clip": 0.01115056, + "auxiliary_loss_mlp": 0.01103448, + "balance_loss_clip": 1.001719, + "balance_loss_mlp": 1.00035548, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.7220191561632097, + "language_loss": 0.79714787, + "learning_rate": 3.78174402269098e-07, + "loss": 0.8193329, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 2.638132095336914 + }, + { + "auxiliary_loss_clip": 0.01165013, + "auxiliary_loss_mlp": 0.01103502, + "balance_loss_clip": 1.00174153, + "balance_loss_mlp": 1.00050509, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.7033880856641408, + "language_loss": 0.68084621, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70353132, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.530980110168457 + }, + { + "auxiliary_loss_clip": 0.0113257, + "auxiliary_loss_mlp": 0.01104147, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00067353, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.7857671879886994, + "language_loss": 0.8013407, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82370788, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 2.5998494625091553 + }, + { + "auxiliary_loss_clip": 0.0114843, + "auxiliary_loss_mlp": 0.01104841, + "balance_loss_clip": 1.00169849, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.4772668075151378, + "language_loss": 0.78776205, + "learning_rate": 3.774909786710232e-07, + "loss": 0.81029475, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 2.6387839317321777 + }, + { + "auxiliary_loss_clip": 0.01133889, + "auxiliary_loss_mlp": 0.01104157, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00068367, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.4778145325537047, + "language_loss": 0.75493217, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77731264, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 2.594747543334961 + }, + { + "auxiliary_loss_clip": 0.01148454, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00040555, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.6290711813028986, + "language_loss": 0.72971356, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75223303, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.5715765953063965 + }, + { + "auxiliary_loss_clip": 0.01085679, + "auxiliary_loss_mlp": 0.0110338, + "balance_loss_clip": 1.00152421, + "balance_loss_mlp": 1.00047827, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.4996191293548555, + "language_loss": 0.69670212, + "learning_rate": 3.768081088042774e-07, + "loss": 0.71859276, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.6770126819610596 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01103721, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00034285, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 2.2784108576810325, + "language_loss": 0.74586654, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76825595, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.5678789615631104 + }, + { + "auxiliary_loss_clip": 0.01148346, + "auxiliary_loss_mlp": 0.01102595, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00064743, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 1.829579757503036, + "language_loss": 0.66937447, + "learning_rate": 3.763531699700568e-07, + "loss": 0.69188386, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 2.5968141555786133 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.01103576, + "balance_loss_clip": 1.00162435, + "balance_loss_mlp": 1.00038838, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.7734259387385725, + "language_loss": 0.79972267, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82195103, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.592167377471924 + }, + { + "auxiliary_loss_clip": 0.01131762, + "auxiliary_loss_mlp": 0.01103726, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00044346, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 1.7324733647962136, + "language_loss": 0.80467236, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82702726, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 2.5694470405578613 + }, + { + "auxiliary_loss_clip": 0.01116912, + "auxiliary_loss_mlp": 0.01104566, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00052071, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 2.422226178790887, + "language_loss": 0.70305276, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72526753, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 2.6286354064941406 + }, + { + "auxiliary_loss_clip": 0.01131923, + "auxiliary_loss_mlp": 0.01103216, + "balance_loss_clip": 1.0015564, + "balance_loss_mlp": 1.00041032, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.4605245968648466, + "language_loss": 0.72409606, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74644744, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.7382304668426514 + }, + { + "auxiliary_loss_clip": 0.0111843, + "auxiliary_loss_mlp": 0.01103931, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.00045693, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 2.312315110768357, + "language_loss": 0.67938912, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70161271, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 2.624722957611084 + }, + { + "auxiliary_loss_clip": 0.01099225, + "auxiliary_loss_mlp": 0.01104229, + "balance_loss_clip": 1.00164533, + "balance_loss_mlp": 1.00046909, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.5846182685120336, + "language_loss": 0.75470066, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77673519, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 4.04206919670105 + }, + { + "auxiliary_loss_clip": 0.01165016, + "auxiliary_loss_mlp": 0.0110343, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00052857, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.6658681406956362, + "language_loss": 0.70311117, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72579563, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.5786473751068115 + }, + { + "auxiliary_loss_clip": 0.01129566, + "auxiliary_loss_mlp": 0.01103856, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.0005728, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.645378905856744, + "language_loss": 0.72804129, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75037545, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 2.6420772075653076 + }, + { + "auxiliary_loss_clip": 0.01131823, + "auxiliary_loss_mlp": 0.01103774, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00068188, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 2.03010905846224, + "language_loss": 0.77134323, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79369915, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 2.5862162113189697 + }, + { + "auxiliary_loss_clip": 0.01165166, + "auxiliary_loss_mlp": 0.01103398, + "balance_loss_clip": 1.00199556, + "balance_loss_mlp": 1.00049675, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.4883554698635044, + "language_loss": 0.78621352, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80889916, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 2.565824031829834 + }, + { + "auxiliary_loss_clip": 0.01132152, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.00167155, + "balance_loss_mlp": 1.0003562, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 1.9971516176087265, + "language_loss": 0.59156555, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.61036205, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 2.563650131225586 + }, + { + "auxiliary_loss_clip": 0.0115029, + "auxiliary_loss_mlp": 0.01103494, + "balance_loss_clip": 1.00183642, + "balance_loss_mlp": 1.00059247, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 1.860900930724668, + "language_loss": 0.76231343, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78485131, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.5336081981658936 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01103943, + "balance_loss_clip": 1.00170636, + "balance_loss_mlp": 1.00056469, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.725100779615729, + "language_loss": 0.70718694, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72954309, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.7352328300476074 + }, + { + "auxiliary_loss_clip": 0.01117246, + "auxiliary_loss_mlp": 0.01103575, + "balance_loss_clip": 1.00192344, + "balance_loss_mlp": 1.00057864, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 1.7106899412331413, + "language_loss": 0.82536012, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84756827, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 2.6325933933258057 + }, + { + "auxiliary_loss_clip": 0.01109846, + "auxiliary_loss_mlp": 0.00746565, + "balance_loss_clip": 1.00127077, + "balance_loss_mlp": 1.00114715, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8552267399132498, + "language_loss": 0.53685403, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55541813, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 3.108011245727539 + }, + { + "auxiliary_loss_clip": 0.01116916, + "auxiliary_loss_mlp": 0.01103912, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00053382, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 2.2548467516743793, + "language_loss": 0.71749216, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.73970044, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 2.6224470138549805 + }, + { + "auxiliary_loss_clip": 0.01134134, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_clip": 1.00185287, + "balance_loss_mlp": 1.00057483, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 1.7746396246816747, + "language_loss": 0.71048021, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73286492, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.602614402770996 + }, + { + "auxiliary_loss_clip": 0.01085668, + "auxiliary_loss_mlp": 0.0110532, + "balance_loss_clip": 1.00172913, + "balance_loss_mlp": 1.00051141, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 1.8539696896004567, + "language_loss": 0.74558628, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.76749623, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.705692768096924 + }, + { + "auxiliary_loss_clip": 0.01160705, + "auxiliary_loss_mlp": 0.01080469, + "balance_loss_clip": 1.00129282, + "balance_loss_mlp": 0.99997866, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7788343707121013, + "language_loss": 0.63860667, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.66101837, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 5.969010591506958 + }, + { + "auxiliary_loss_clip": 0.01150491, + "auxiliary_loss_mlp": 0.01103243, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.0004369, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.624777322024814, + "language_loss": 0.73827773, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76081502, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 4.002661228179932 + }, + { + "auxiliary_loss_clip": 0.01133117, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.00168359, + "balance_loss_mlp": 1.00039446, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.5732354905333366, + "language_loss": 0.74118865, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75999403, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 2.571408987045288 + }, + { + "auxiliary_loss_clip": 0.01136, + "auxiliary_loss_mlp": 0.01104774, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00044227, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.6249877355656923, + "language_loss": 0.80492342, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82733113, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 2.5801548957824707 + }, + { + "auxiliary_loss_clip": 0.01148533, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.00188458, + "balance_loss_mlp": 1.00057769, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 1.9546212297326953, + "language_loss": 0.7848314, + "learning_rate": 3.711390917482875e-07, + "loss": 0.8073554, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.6107211112976074 + }, + { + "auxiliary_loss_clip": 0.011021, + "auxiliary_loss_mlp": 0.01103427, + "balance_loss_clip": 1.00158083, + "balance_loss_mlp": 1.00043046, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.090977358723682, + "language_loss": 0.76950181, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79155719, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 2.665750026702881 + }, + { + "auxiliary_loss_clip": 0.01116944, + "auxiliary_loss_mlp": 0.01103608, + "balance_loss_clip": 1.00165308, + "balance_loss_mlp": 1.00051546, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.8538064271274555, + "language_loss": 0.7681393, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.79034483, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 2.6681270599365234 + }, + { + "auxiliary_loss_clip": 0.01133838, + "auxiliary_loss_mlp": 0.01103535, + "balance_loss_clip": 1.00170982, + "balance_loss_mlp": 1.0004431, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.833025332511378, + "language_loss": 0.79016459, + "learning_rate": 3.70461401253471e-07, + "loss": 0.81253839, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 2.5493290424346924 + }, + { + "auxiliary_loss_clip": 0.01165283, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_clip": 1.00200188, + "balance_loss_mlp": 1.00055873, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 1.8131167193991322, + "language_loss": 0.71454704, + "learning_rate": 3.702356279949801e-07, + "loss": 0.73723829, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 2.5641191005706787 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01102988, + "balance_loss_clip": 1.00175214, + "balance_loss_mlp": 1.00046766, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 4.348275588627958, + "language_loss": 0.72572297, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74808526, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.6088740825653076 + }, + { + "auxiliary_loss_clip": 0.01148594, + "auxiliary_loss_mlp": 0.0110414, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00057077, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 2.2426670588681374, + "language_loss": 0.78553069, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.80805796, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.515895128250122 + }, + { + "auxiliary_loss_clip": 0.01131696, + "auxiliary_loss_mlp": 0.01104396, + "balance_loss_clip": 1.00181425, + "balance_loss_mlp": 1.00044525, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 1.8315767116241486, + "language_loss": 0.80065703, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82301795, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.5674381256103516 + }, + { + "auxiliary_loss_clip": 0.01133363, + "auxiliary_loss_mlp": 0.01104471, + "balance_loss_clip": 1.0017277, + "balance_loss_mlp": 1.00052071, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.7012251739749336, + "language_loss": 0.84625489, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86863327, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 2.594660520553589 + }, + { + "auxiliary_loss_clip": 0.01148651, + "auxiliary_loss_mlp": 0.01104917, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00068021, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.677814082484799, + "language_loss": 0.76145142, + "learning_rate": 3.69107688886096e-07, + "loss": 0.7839871, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.5947978496551514 + }, + { + "auxiliary_loss_clip": 0.01133892, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00049424, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 1.6773752255927437, + "language_loss": 0.83005035, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85243183, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.648764133453369 + }, + { + "auxiliary_loss_clip": 0.01165147, + "auxiliary_loss_mlp": 0.01103545, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00054812, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 1.6783523152548068, + "language_loss": 0.6201762, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64286315, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 2.4841153621673584 + }, + { + "auxiliary_loss_clip": 0.01165058, + "auxiliary_loss_mlp": 0.01103069, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00054836, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.4354589970570435, + "language_loss": 0.61840177, + "learning_rate": 3.684316674755341e-07, + "loss": 0.64108306, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.566965103149414 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.011034, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00068879, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.582264640643859, + "language_loss": 0.81720895, + "learning_rate": 3.682064507324256e-07, + "loss": 0.83972764, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.5214293003082275 + }, + { + "auxiliary_loss_clip": 0.01134987, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00034428, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.8576318597905077, + "language_loss": 0.76651585, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.78533989, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.619140625 + }, + { + "auxiliary_loss_clip": 0.01118595, + "auxiliary_loss_mlp": 0.01103762, + "balance_loss_clip": 1.00162077, + "balance_loss_mlp": 1.00047863, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 1.6926397165441434, + "language_loss": 0.7937789, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81600249, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.629720449447632 + }, + { + "auxiliary_loss_clip": 0.0115032, + "auxiliary_loss_mlp": 0.01102649, + "balance_loss_clip": 1.00179219, + "balance_loss_mlp": 1.00051022, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.7889012118633951, + "language_loss": 0.6803745, + "learning_rate": 3.675311718038978e-07, + "loss": 0.70290422, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.521944046020508 + }, + { + "auxiliary_loss_clip": 0.01112862, + "auxiliary_loss_mlp": 0.01080124, + "balance_loss_clip": 1.00114775, + "balance_loss_mlp": 1.00001585, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6888277195004099, + "language_loss": 0.54653525, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56846511, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 3.2720608711242676 + }, + { + "auxiliary_loss_clip": 0.01165168, + "auxiliary_loss_mlp": 0.01103464, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00046754, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.6171535481707406, + "language_loss": 0.6932981, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71598446, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 2.4950549602508545 + }, + { + "auxiliary_loss_clip": 0.01149595, + "auxiliary_loss_mlp": 0.01104093, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00042808, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.7573137259088496, + "language_loss": 0.79538018, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.81791699, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.5680148601531982 + }, + { + "auxiliary_loss_clip": 0.01146578, + "auxiliary_loss_mlp": 0.01080462, + "balance_loss_clip": 1.00137353, + "balance_loss_mlp": 0.99997205, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.749911290248725, + "language_loss": 0.57741916, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59968948, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 4.421412229537964 + }, + { + "auxiliary_loss_clip": 0.01101096, + "auxiliary_loss_mlp": 0.01104768, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00043654, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 1.6076561611875257, + "language_loss": 0.74061418, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76267284, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 2.6447324752807617 + }, + { + "auxiliary_loss_clip": 0.01148801, + "auxiliary_loss_mlp": 0.01104186, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.00061679, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.7592376511600323, + "language_loss": 0.78862357, + "learning_rate": 3.661822855683723e-07, + "loss": 0.81115347, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 2.5232532024383545 + }, + { + "auxiliary_loss_clip": 0.01148356, + "auxiliary_loss_mlp": 0.01103248, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00053692, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.5374396544390256, + "language_loss": 0.75175434, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77427042, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 2.5483624935150146 + }, + { + "auxiliary_loss_clip": 0.01133834, + "auxiliary_loss_mlp": 0.01104485, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00063002, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.056516728487099, + "language_loss": 0.73953772, + "learning_rate": 3.657331523685485e-07, + "loss": 0.76192093, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 2.5244319438934326 + }, + { + "auxiliary_loss_clip": 0.01131477, + "auxiliary_loss_mlp": 0.01104219, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00055504, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 1.841713713279165, + "language_loss": 0.69874591, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.72110283, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 2.564314365386963 + }, + { + "auxiliary_loss_clip": 0.01160734, + "auxiliary_loss_mlp": 0.01080434, + "balance_loss_clip": 1.00127947, + "balance_loss_mlp": 0.99994391, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6865423364996263, + "language_loss": 0.52139741, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54380912, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 3.035968780517578 + }, + { + "auxiliary_loss_clip": 0.01100193, + "auxiliary_loss_mlp": 0.01104582, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00044107, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.6307343462742407, + "language_loss": 0.71376073, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73580849, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 2.6696228981018066 + }, + { + "auxiliary_loss_clip": 0.01165316, + "auxiliary_loss_mlp": 0.01104645, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00069475, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.5752853166914722, + "language_loss": 0.78927517, + "learning_rate": 3.648356296957327e-07, + "loss": 0.8119747, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.5547921657562256 + }, + { + "auxiliary_loss_clip": 0.01131901, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_clip": 1.00171447, + "balance_loss_mlp": 1.00061226, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.8138080378717911, + "language_loss": 0.72303915, + "learning_rate": 3.646114040202548e-07, + "loss": 0.7453981, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.565218448638916 + }, + { + "auxiliary_loss_clip": 0.01079964, + "auxiliary_loss_mlp": 0.01102817, + "balance_loss_clip": 1.00146198, + "balance_loss_mlp": 1.00039208, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.0802343086436976, + "language_loss": 0.65558559, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.6774134, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 2.6403141021728516 + }, + { + "auxiliary_loss_clip": 0.01135415, + "auxiliary_loss_mlp": 0.01103603, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.00051081, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.6257899537410592, + "language_loss": 0.76342583, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78581595, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.5783917903900146 + }, + { + "auxiliary_loss_clip": 0.01134376, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.0005722, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 2.0580258526864945, + "language_loss": 0.72213113, + "learning_rate": 3.639390991124183e-07, + "loss": 0.7445268, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 2.5974671840667725 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00051904, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.8727712492307491, + "language_loss": 0.75874811, + "learning_rate": 3.637151215443308e-07, + "loss": 0.78099263, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 4.095415830612183 + }, + { + "auxiliary_loss_clip": 0.01131868, + "auxiliary_loss_mlp": 0.01104762, + "balance_loss_clip": 1.00182068, + "balance_loss_mlp": 1.0006206, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 1.8761366976854266, + "language_loss": 0.72257638, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74494267, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 4.040979862213135 + }, + { + "auxiliary_loss_clip": 0.01084811, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_clip": 1.00156403, + "balance_loss_mlp": 1.00044, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.6751553795359657, + "language_loss": 0.84109807, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86296821, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.73380970954895 + }, + { + "auxiliary_loss_clip": 0.01165298, + "auxiliary_loss_mlp": 0.01104141, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00047708, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.849184246632573, + "language_loss": 0.74040884, + "learning_rate": 3.630435611625502e-07, + "loss": 0.76310325, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 2.4950883388519287 + }, + { + "auxiliary_loss_clip": 0.01099158, + "auxiliary_loss_mlp": 0.00747306, + "balance_loss_clip": 1.00154901, + "balance_loss_mlp": 1.00039601, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.5965146714098373, + "language_loss": 0.71645337, + "learning_rate": 3.628198318377453e-07, + "loss": 0.734918, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 2.6578662395477295 + }, + { + "auxiliary_loss_clip": 0.01118748, + "auxiliary_loss_mlp": 0.01104883, + "balance_loss_clip": 1.00166512, + "balance_loss_mlp": 1.00064647, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 1.8384309707046944, + "language_loss": 0.71299899, + "learning_rate": 3.625961645949762e-07, + "loss": 0.73523527, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.59877610206604 + }, + { + "auxiliary_loss_clip": 0.01165113, + "auxiliary_loss_mlp": 0.01103608, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00042057, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.3505640280073314, + "language_loss": 0.67662412, + "learning_rate": 3.623725594427245e-07, + "loss": 0.69931132, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 2.5212576389312744 + }, + { + "auxiliary_loss_clip": 0.01100505, + "auxiliary_loss_mlp": 0.01103792, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.6539691613593595, + "language_loss": 0.72012371, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74216664, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 2.6898674964904785 + }, + { + "auxiliary_loss_clip": 0.01150741, + "auxiliary_loss_mlp": 0.01104794, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.00065231, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.5269689452251731, + "language_loss": 0.70605093, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72860628, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.5862743854522705 + }, + { + "auxiliary_loss_clip": 0.01148005, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_clip": 1.00178421, + "balance_loss_mlp": 1.00050175, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.0093902916094466, + "language_loss": 0.76544678, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78797513, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.5339508056640625 + }, + { + "auxiliary_loss_clip": 0.01133686, + "auxiliary_loss_mlp": 0.01103742, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00055492, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.791667617923731, + "language_loss": 0.79939306, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82176733, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.6181538105010986 + }, + { + "auxiliary_loss_clip": 0.01150405, + "auxiliary_loss_mlp": 0.0110433, + "balance_loss_clip": 1.00187552, + "balance_loss_mlp": 1.00047457, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.5227903720963296, + "language_loss": 0.70976001, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73230737, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.546358346939087 + }, + { + "auxiliary_loss_clip": 0.01118556, + "auxiliary_loss_mlp": 0.01104115, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00045085, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 2.93518693808776, + "language_loss": 0.76941013, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79163682, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.7229294776916504 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01103262, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00064683, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 2.3315475523839653, + "language_loss": 0.84287959, + "learning_rate": 3.608090626234055e-07, + "loss": 0.86556351, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.485858201980591 + }, + { + "auxiliary_loss_clip": 0.0111904, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00040519, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.4900179192975982, + "language_loss": 0.76199019, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78422129, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 2.6189956665039062 + }, + { + "auxiliary_loss_clip": 0.01128902, + "auxiliary_loss_mlp": 0.01080138, + "balance_loss_clip": 1.0011245, + "balance_loss_mlp": 1.00002909, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8045945231447551, + "language_loss": 0.59960151, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62169194, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.1899948120117188 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01102862, + "balance_loss_clip": 1.00175357, + "balance_loss_mlp": 1.0004375, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.5022088646763216, + "language_loss": 0.79001272, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81252378, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.561490535736084 + }, + { + "auxiliary_loss_clip": 0.01135293, + "auxiliary_loss_mlp": 0.01103814, + "balance_loss_clip": 1.00178373, + "balance_loss_mlp": 1.00062597, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8896137948911709, + "language_loss": 0.71091986, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73331088, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.5446879863739014 + }, + { + "auxiliary_loss_clip": 0.01133968, + "auxiliary_loss_mlp": 0.01104716, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.0004797, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.6065091105831166, + "language_loss": 0.6769827, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69936961, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 2.801694393157959 + }, + { + "auxiliary_loss_clip": 0.01131452, + "auxiliary_loss_mlp": 0.01104426, + "balance_loss_clip": 1.00164104, + "balance_loss_mlp": 1.00047553, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 2.3987025174518863, + "language_loss": 0.74696875, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76932758, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.8303134441375732 + }, + { + "auxiliary_loss_clip": 0.01131328, + "auxiliary_loss_mlp": 0.011044, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00044918, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.03312938847926, + "language_loss": 0.72661507, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.74897236, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.676105499267578 + }, + { + "auxiliary_loss_clip": 0.01165276, + "auxiliary_loss_mlp": 0.01104252, + "balance_loss_clip": 1.00182676, + "balance_loss_mlp": 1.0004921, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 3.0000906680787938, + "language_loss": 0.76206303, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78475833, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.4870924949645996 + }, + { + "auxiliary_loss_clip": 0.01165324, + "auxiliary_loss_mlp": 0.01105186, + "balance_loss_clip": 1.00181317, + "balance_loss_mlp": 1.00047255, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.6484282609845677, + "language_loss": 0.70277584, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72548091, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.5126192569732666 + }, + { + "auxiliary_loss_clip": 0.0114856, + "auxiliary_loss_mlp": 0.01103721, + "balance_loss_clip": 1.00184858, + "balance_loss_mlp": 1.00062871, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.677832143217486, + "language_loss": 0.75763929, + "learning_rate": 3.585807799107785e-07, + "loss": 0.7801621, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 2.526370048522949 + }, + { + "auxiliary_loss_clip": 0.01165204, + "auxiliary_loss_mlp": 0.01104145, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00057566, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.6597754728305105, + "language_loss": 0.77181274, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79450619, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.582432985305786 + }, + { + "auxiliary_loss_clip": 0.01150502, + "auxiliary_loss_mlp": 0.0110464, + "balance_loss_clip": 1.00174356, + "balance_loss_mlp": 1.00049853, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.7525241571826422, + "language_loss": 0.70199776, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72454917, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 3.8668742179870605 + }, + { + "auxiliary_loss_clip": 0.01133747, + "auxiliary_loss_mlp": 0.01103993, + "balance_loss_clip": 1.00173032, + "balance_loss_mlp": 1.00051904, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.9080922118801147, + "language_loss": 0.79639524, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81877267, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.5796828269958496 + }, + { + "auxiliary_loss_clip": 0.01148443, + "auxiliary_loss_mlp": 0.01104028, + "balance_loss_clip": 1.00175583, + "balance_loss_mlp": 1.00045896, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.5282689310726054, + "language_loss": 0.63351059, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.6560353, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 2.9247283935546875 + }, + { + "auxiliary_loss_clip": 0.01100207, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_clip": 1.00153768, + "balance_loss_mlp": 1.00046074, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.732133791181257, + "language_loss": 0.71451586, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.7365582, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 2.6380183696746826 + }, + { + "auxiliary_loss_clip": 0.01115168, + "auxiliary_loss_mlp": 0.01103192, + "balance_loss_clip": 1.0015527, + "balance_loss_mlp": 1.000386, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.7782689577602109, + "language_loss": 0.63045853, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65264213, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 2.691009521484375 + }, + { + "auxiliary_loss_clip": 0.01164837, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.00179064, + "balance_loss_mlp": 1.00038004, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 1.5076584878562294, + "language_loss": 0.75158304, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77070487, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.4911627769470215 + }, + { + "auxiliary_loss_clip": 0.01097692, + "auxiliary_loss_mlp": 0.01104449, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.0005939, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 2.238406202774225, + "language_loss": 0.91327822, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93529963, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.659738540649414 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00177372, + "balance_loss_mlp": 1.00037146, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.4064096242922812, + "language_loss": 0.7905634, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80952048, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 2.5691826343536377 + }, + { + "auxiliary_loss_clip": 0.01150452, + "auxiliary_loss_mlp": 0.01104066, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00059199, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.4959523528697634, + "language_loss": 0.7884956, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81104076, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 2.575662851333618 + }, + { + "auxiliary_loss_clip": 0.01165169, + "auxiliary_loss_mlp": 0.01103879, + "balance_loss_clip": 1.00188911, + "balance_loss_mlp": 1.00059652, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.6000795217169226, + "language_loss": 0.70303178, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72572231, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 2.5618960857391357 + }, + { + "auxiliary_loss_clip": 0.01134883, + "auxiliary_loss_mlp": 0.01104481, + "balance_loss_clip": 1.0017885, + "balance_loss_mlp": 1.00053024, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.4147302701984983, + "language_loss": 0.72983885, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.75223249, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 2.5697696208953857 + }, + { + "auxiliary_loss_clip": 0.01149802, + "auxiliary_loss_mlp": 0.01103644, + "balance_loss_clip": 1.00170529, + "balance_loss_mlp": 1.00045681, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.5853845883929867, + "language_loss": 0.69683635, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.71937084, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 2.592693567276001 + }, + { + "auxiliary_loss_clip": 0.01148373, + "auxiliary_loss_mlp": 0.01103039, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00061417, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.4209854290641997, + "language_loss": 0.69990575, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72241986, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 2.522606611251831 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_clip": 1.00184131, + "balance_loss_mlp": 1.00039124, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 1.9155949491772923, + "language_loss": 0.71230525, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.7348386, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 5.319593667984009 + }, + { + "auxiliary_loss_clip": 0.01148428, + "auxiliary_loss_mlp": 0.01103085, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.00037372, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.8908946792108654, + "language_loss": 0.62600249, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64851761, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 2.586623430252075 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.01103567, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00057018, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.7384234693677432, + "language_loss": 0.6584217, + "learning_rate": 3.548069885262628e-07, + "loss": 0.68075144, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 4.027392148971558 + }, + { + "auxiliary_loss_clip": 0.01131679, + "auxiliary_loss_mlp": 0.01102982, + "balance_loss_clip": 1.00173128, + "balance_loss_mlp": 1.00046217, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.6363844148220348, + "language_loss": 0.75115418, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.7735008, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.6152164936065674 + }, + { + "auxiliary_loss_clip": 0.01165075, + "auxiliary_loss_mlp": 0.01104173, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00041318, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 1.6088846255925737, + "language_loss": 0.70820475, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.73089719, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 2.532776117324829 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01103301, + "balance_loss_clip": 1.00182331, + "balance_loss_mlp": 1.00058997, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 1.7868108546112833, + "language_loss": 0.6895861, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71227062, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.454881429672241 + }, + { + "auxiliary_loss_clip": 0.01145936, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00047588, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.898962469883971, + "language_loss": 0.7762996, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79878891, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 2.5607662200927734 + }, + { + "auxiliary_loss_clip": 0.01148262, + "auxiliary_loss_mlp": 0.0110349, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00058889, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 1.6988501930578364, + "language_loss": 0.82008809, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84260559, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 2.507035493850708 + }, + { + "auxiliary_loss_clip": 0.01135548, + "auxiliary_loss_mlp": 0.0110431, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00045443, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 1.8452994006088628, + "language_loss": 0.71759087, + "learning_rate": 3.534793646536065e-07, + "loss": 0.7399894, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 2.512129545211792 + }, + { + "auxiliary_loss_clip": 0.01118507, + "auxiliary_loss_mlp": 0.01103446, + "balance_loss_clip": 1.0017395, + "balance_loss_mlp": 1.00054407, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 1.691054482930854, + "language_loss": 0.76357454, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78579408, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.594135046005249 + }, + { + "auxiliary_loss_clip": 0.01165272, + "auxiliary_loss_mlp": 0.00747487, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00040114, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 3.3080699979282424, + "language_loss": 0.76673186, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78585947, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01146058, + "auxiliary_loss_mlp": 0.01103511, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00051463, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 1.891894186080061, + "language_loss": 0.93368554, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95618117, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.496628761291504 + }, + { + "auxiliary_loss_clip": 0.01115639, + "auxiliary_loss_mlp": 0.01103605, + "balance_loss_clip": 1.00172496, + "balance_loss_mlp": 1.00051248, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.5644261462927294, + "language_loss": 0.70577347, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72796589, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.6454405784606934 + }, + { + "auxiliary_loss_clip": 0.01117983, + "auxiliary_loss_mlp": 0.01104097, + "balance_loss_clip": 1.00173414, + "balance_loss_mlp": 1.00052762, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.4725811294849285, + "language_loss": 0.75588268, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77810353, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.6491754055023193 + }, + { + "auxiliary_loss_clip": 0.01135411, + "auxiliary_loss_mlp": 0.01102883, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00045884, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 2.837605480274686, + "language_loss": 0.76603556, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78841853, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.5932862758636475 + }, + { + "auxiliary_loss_clip": 0.01148583, + "auxiliary_loss_mlp": 0.01103892, + "balance_loss_clip": 1.00162673, + "balance_loss_mlp": 1.00051379, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.7600590468111423, + "language_loss": 0.78211606, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80464077, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 2.526458501815796 + }, + { + "auxiliary_loss_clip": 0.01103986, + "auxiliary_loss_mlp": 0.01102827, + "balance_loss_clip": 1.00173283, + "balance_loss_mlp": 1.00049794, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 6.941054115122537, + "language_loss": 0.66659665, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68866479, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 2.7747325897216797 + }, + { + "auxiliary_loss_clip": 0.01150371, + "auxiliary_loss_mlp": 0.01103502, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00050509, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.4828813625352621, + "language_loss": 0.67408764, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69662642, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.608168840408325 + }, + { + "auxiliary_loss_clip": 0.0116504, + "auxiliary_loss_mlp": 0.01104019, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00044966, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 2.1566036771095813, + "language_loss": 0.68772751, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71041811, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.5273537635803223 + }, + { + "auxiliary_loss_clip": 0.01165227, + "auxiliary_loss_mlp": 0.01104798, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.00056171, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.3327763408134374, + "language_loss": 0.79087377, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.81357396, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.4303362369537354 + }, + { + "auxiliary_loss_clip": 0.01116288, + "auxiliary_loss_mlp": 0.01105418, + "balance_loss_clip": 1.00179291, + "balance_loss_mlp": 1.00070405, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.518915875062533, + "language_loss": 0.77563304, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79785007, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.5738401412963867 + }, + { + "auxiliary_loss_clip": 0.01165519, + "auxiliary_loss_mlp": 0.01105403, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00040293, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 2.829168158149837, + "language_loss": 0.73495096, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75766015, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 2.4437663555145264 + }, + { + "auxiliary_loss_clip": 0.0115027, + "auxiliary_loss_mlp": 0.01101925, + "balance_loss_clip": 1.00179327, + "balance_loss_mlp": 1.00045395, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.5172684891598502, + "language_loss": 0.76756114, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.79008311, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.561250925064087 + }, + { + "auxiliary_loss_clip": 0.0114852, + "auxiliary_loss_mlp": 0.01104117, + "balance_loss_clip": 1.00190771, + "balance_loss_mlp": 1.000453, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.052237150240636, + "language_loss": 0.70356143, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72608781, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.5654549598693848 + }, + { + "auxiliary_loss_clip": 0.01165199, + "auxiliary_loss_mlp": 0.01105356, + "balance_loss_clip": 1.00187528, + "balance_loss_mlp": 1.00064254, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 1.9218276270832981, + "language_loss": 0.70855558, + "learning_rate": 3.49950028014111e-07, + "loss": 0.73126113, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.515833854675293 + }, + { + "auxiliary_loss_clip": 0.01148533, + "auxiliary_loss_mlp": 0.01104495, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00044918, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.7398171318822695, + "language_loss": 0.77089739, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79342765, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 3.911620616912842 + }, + { + "auxiliary_loss_clip": 0.01165232, + "auxiliary_loss_mlp": 0.01104344, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00058436, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 2.2842177121257596, + "language_loss": 0.71465248, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.7373482, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 2.477700710296631 + }, + { + "auxiliary_loss_clip": 0.01148328, + "auxiliary_loss_mlp": 0.01103334, + "balance_loss_clip": 1.00177085, + "balance_loss_mlp": 1.00052774, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.8260795126357678, + "language_loss": 0.71778971, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74030632, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.5080583095550537 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01105179, + "balance_loss_clip": 1.00159049, + "balance_loss_mlp": 1.00046527, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 5.685237734338865, + "language_loss": 0.68414801, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70619023, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.616400718688965 + }, + { + "auxiliary_loss_clip": 0.01165199, + "auxiliary_loss_mlp": 0.01104068, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00078487, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.6915487802342117, + "language_loss": 0.82409739, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84679008, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 2.4563536643981934 + }, + { + "auxiliary_loss_clip": 0.01150376, + "auxiliary_loss_mlp": 0.01104147, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00048256, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 2.53828581153212, + "language_loss": 0.67976737, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70231259, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 2.51462459564209 + }, + { + "auxiliary_loss_clip": 0.01135633, + "auxiliary_loss_mlp": 0.01104431, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00048018, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.7138058700206815, + "language_loss": 0.66105258, + "learning_rate": 3.484109781056723e-07, + "loss": 0.6834532, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 2.6489005088806152 + }, + { + "auxiliary_loss_clip": 0.01150534, + "auxiliary_loss_mlp": 0.01103759, + "balance_loss_clip": 1.00172746, + "balance_loss_mlp": 1.00057161, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.0589757811903784, + "language_loss": 0.73335034, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75589323, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 2.5010416507720947 + }, + { + "auxiliary_loss_clip": 0.01148372, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00037789, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 3.000174249497048, + "language_loss": 0.80369473, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82621121, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 2.5093131065368652 + }, + { + "auxiliary_loss_clip": 0.01133229, + "auxiliary_loss_mlp": 0.01103958, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00067532, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.668855171642796, + "language_loss": 0.65785754, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68022943, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 2.6145482063293457 + }, + { + "auxiliary_loss_clip": 0.01160694, + "auxiliary_loss_mlp": 0.01080432, + "balance_loss_clip": 1.00125384, + "balance_loss_mlp": 0.99994147, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.7918609960934505, + "language_loss": 0.56978816, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.59219939, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 3.0121684074401855 + }, + { + "auxiliary_loss_clip": 0.01131309, + "auxiliary_loss_mlp": 0.01080467, + "balance_loss_clip": 1.00125289, + "balance_loss_mlp": 0.99997741, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6943891332816979, + "language_loss": 0.55298483, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57510257, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 2.9858343601226807 + }, + { + "auxiliary_loss_clip": 0.01135467, + "auxiliary_loss_mlp": 0.01103444, + "balance_loss_clip": 1.00171733, + "balance_loss_mlp": 1.00054264, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.6304715758151798, + "language_loss": 0.6711365, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69352555, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.541358709335327 + }, + { + "auxiliary_loss_clip": 0.01148828, + "auxiliary_loss_mlp": 0.01104112, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00044739, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.518199445038728, + "language_loss": 0.81510067, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83763003, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 5.304166078567505 + }, + { + "auxiliary_loss_clip": 0.01133613, + "auxiliary_loss_mlp": 0.01104311, + "balance_loss_clip": 1.0019027, + "balance_loss_mlp": 1.00055063, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.5732982801163242, + "language_loss": 0.71922421, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74160349, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 2.624750852584839 + }, + { + "auxiliary_loss_clip": 0.01051269, + "auxiliary_loss_mlp": 0.0110324, + "balance_loss_clip": 1.00173295, + "balance_loss_mlp": 1.00043368, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.4897406572289367, + "language_loss": 0.69830728, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.71985233, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 4.317736864089966 + }, + { + "auxiliary_loss_clip": 0.01131953, + "auxiliary_loss_mlp": 0.01103947, + "balance_loss_clip": 1.00163603, + "balance_loss_mlp": 1.0004735, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.9024715888036499, + "language_loss": 0.7036804, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72603941, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 2.8526978492736816 + }, + { + "auxiliary_loss_clip": 0.01150386, + "auxiliary_loss_mlp": 0.01103373, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00066197, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.746308441014578, + "language_loss": 0.78919345, + "learning_rate": 3.459986724180188e-07, + "loss": 0.8117311, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 2.562643051147461 + }, + { + "auxiliary_loss_clip": 0.01133676, + "auxiliary_loss_mlp": 0.01102891, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00056195, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.8285522696005099, + "language_loss": 0.82611865, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84848428, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.5576107501983643 + }, + { + "auxiliary_loss_clip": 0.01165075, + "auxiliary_loss_mlp": 0.01102515, + "balance_loss_clip": 1.00186336, + "balance_loss_mlp": 1.00037646, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.140348669695115, + "language_loss": 0.79586482, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81854069, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 2.49653959274292 + }, + { + "auxiliary_loss_clip": 0.01133922, + "auxiliary_loss_mlp": 0.01102355, + "balance_loss_clip": 1.00168979, + "balance_loss_mlp": 1.00050282, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.8221373107213337, + "language_loss": 0.77044672, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79280949, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.5827648639678955 + }, + { + "auxiliary_loss_clip": 0.01165124, + "auxiliary_loss_mlp": 0.01103736, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00064421, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.6901579523402415, + "language_loss": 0.5870508, + "learning_rate": 3.451233513649199e-07, + "loss": 0.60973942, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.524700403213501 + }, + { + "auxiliary_loss_clip": 0.01150764, + "auxiliary_loss_mlp": 0.01104672, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00062633, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.9538105674254318, + "language_loss": 0.82264662, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84520096, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 2.5660240650177 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00074029, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.4747868028944997, + "language_loss": 0.78934163, + "learning_rate": 3.446860673237142e-07, + "loss": 0.81172192, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 2.5132405757904053 + }, + { + "auxiliary_loss_clip": 0.0116517, + "auxiliary_loss_mlp": 0.01103637, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00064027, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.9671733651220358, + "language_loss": 0.64904362, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.67173171, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.54399037361145 + }, + { + "auxiliary_loss_clip": 0.01116804, + "auxiliary_loss_mlp": 0.01103088, + "balance_loss_clip": 1.00171041, + "balance_loss_mlp": 1.00066376, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.6335587625261578, + "language_loss": 0.75120574, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77340466, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.6548171043395996 + }, + { + "auxiliary_loss_clip": 0.01150409, + "auxiliary_loss_mlp": 0.01104398, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00054252, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 3.475133480323575, + "language_loss": 0.59629482, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61884284, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.639049768447876 + }, + { + "auxiliary_loss_clip": 0.0106847, + "auxiliary_loss_mlp": 0.01104366, + "balance_loss_clip": 1.00161684, + "balance_loss_mlp": 1.00041592, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.7919679201518244, + "language_loss": 0.74519348, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76692188, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.7420060634613037 + }, + { + "auxiliary_loss_clip": 0.011287, + "auxiliary_loss_mlp": 0.01080145, + "balance_loss_clip": 1.00113869, + "balance_loss_mlp": 1.00003612, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8220610505864017, + "language_loss": 0.58643627, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60852468, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.17175030708313 + }, + { + "auxiliary_loss_clip": 0.01099868, + "auxiliary_loss_mlp": 0.01103268, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00055766, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.4934297927227969, + "language_loss": 0.71114844, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73317981, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 2.653759479522705 + }, + { + "auxiliary_loss_clip": 0.01117215, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.00051963, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.724041659208148, + "language_loss": 0.73634595, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75855517, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 2.6498281955718994 + }, + { + "auxiliary_loss_clip": 0.01165199, + "auxiliary_loss_mlp": 0.01103442, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00054097, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 2.170829231229158, + "language_loss": 0.79453313, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.8172195, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.511605739593506 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01103207, + "balance_loss_clip": 1.00166893, + "balance_loss_mlp": 1.00068736, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.7906726220482578, + "language_loss": 0.68988025, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71209514, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.6810503005981445 + }, + { + "auxiliary_loss_clip": 0.01165272, + "auxiliary_loss_mlp": 0.01103506, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00050926, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.5595320951880893, + "language_loss": 0.59414589, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61683369, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.528306484222412 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.00747257, + "balance_loss_clip": 1.00174177, + "balance_loss_mlp": 1.00028205, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.422854635664074, + "language_loss": 0.82288796, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84153515, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.714228630065918 + }, + { + "auxiliary_loss_clip": 0.01135481, + "auxiliary_loss_mlp": 0.01103654, + "balance_loss_clip": 1.00188625, + "balance_loss_mlp": 1.00046659, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.5396815243887056, + "language_loss": 0.74497199, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76736331, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.5543882846832275 + }, + { + "auxiliary_loss_clip": 0.01148157, + "auxiliary_loss_mlp": 0.01104434, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.00048399, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.5538680499366808, + "language_loss": 0.74428904, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76681495, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 2.5532474517822266 + }, + { + "auxiliary_loss_clip": 0.0113197, + "auxiliary_loss_mlp": 0.01104092, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.0006181, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.64691266978182, + "language_loss": 0.69621909, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71857977, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 3.9692063331604004 + }, + { + "auxiliary_loss_clip": 0.01084896, + "auxiliary_loss_mlp": 0.01102941, + "balance_loss_clip": 1.001616, + "balance_loss_mlp": 1.00061178, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.4977501389669485, + "language_loss": 0.60883164, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63071007, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 2.716130495071411 + }, + { + "auxiliary_loss_clip": 0.01148497, + "auxiliary_loss_mlp": 0.01103762, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00057459, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.1722157070120494, + "language_loss": 0.69580805, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71833062, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 2.587385654449463 + }, + { + "auxiliary_loss_clip": 0.01133884, + "auxiliary_loss_mlp": 0.01104154, + "balance_loss_clip": 1.00183308, + "balance_loss_mlp": 1.00058532, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.602783375413353, + "language_loss": 0.73072284, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75310326, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.575327157974243 + }, + { + "auxiliary_loss_clip": 0.01148263, + "auxiliary_loss_mlp": 0.01103803, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.000615, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.6653371292954697, + "language_loss": 0.73067433, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75319493, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 2.550992965698242 + }, + { + "auxiliary_loss_clip": 0.01165367, + "auxiliary_loss_mlp": 0.01104716, + "balance_loss_clip": 1.00198162, + "balance_loss_mlp": 1.00057518, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.9232946613081703, + "language_loss": 0.6509769, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67367774, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 2.586787700653076 + }, + { + "auxiliary_loss_clip": 0.0116523, + "auxiliary_loss_mlp": 0.0110401, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00053632, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 1.6736910424786213, + "language_loss": 0.68080932, + "learning_rate": 3.403270471641373e-07, + "loss": 0.7035017, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.500711441040039 + }, + { + "auxiliary_loss_clip": 0.01135247, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00038791, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 2.565175492122321, + "language_loss": 0.66615373, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68854475, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.609313726425171 + }, + { + "auxiliary_loss_clip": 0.01149823, + "auxiliary_loss_mlp": 0.01102512, + "balance_loss_clip": 1.00177968, + "balance_loss_mlp": 1.0005641, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 1.8031453388777785, + "language_loss": 0.69792175, + "learning_rate": 3.398925286280188e-07, + "loss": 0.72044516, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 2.579204559326172 + }, + { + "auxiliary_loss_clip": 0.01165266, + "auxiliary_loss_mlp": 0.01104769, + "balance_loss_clip": 1.00187743, + "balance_loss_mlp": 1.00053263, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.8096631105330387, + "language_loss": 0.66136748, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.68406785, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 2.5053303241729736 + }, + { + "auxiliary_loss_clip": 0.01112915, + "auxiliary_loss_mlp": 0.01104555, + "balance_loss_clip": 1.00166392, + "balance_loss_mlp": 1.00050962, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.6928217167146005, + "language_loss": 0.78720248, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80937725, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 2.61444091796875 + }, + { + "auxiliary_loss_clip": 0.01133561, + "auxiliary_loss_mlp": 0.01102409, + "balance_loss_clip": 1.00176406, + "balance_loss_mlp": 1.0004611, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 2.823063093748022, + "language_loss": 0.58151621, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60387588, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.5628273487091064 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01102223, + "balance_loss_clip": 1.00149596, + "balance_loss_mlp": 1.00046563, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.7656205877183877, + "language_loss": 0.82582009, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84783864, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.6635425090789795 + }, + { + "auxiliary_loss_clip": 0.01067737, + "auxiliary_loss_mlp": 0.01103935, + "balance_loss_clip": 1.001513, + "balance_loss_mlp": 1.00055718, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 2.3862130231019987, + "language_loss": 0.82331121, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84502792, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 4.189488410949707 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01102701, + "balance_loss_clip": 1.00156724, + "balance_loss_mlp": 1.00056243, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.6841353878682015, + "language_loss": 0.83804011, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86008734, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 4.095638036727905 + }, + { + "auxiliary_loss_clip": 0.0111487, + "auxiliary_loss_mlp": 0.01103484, + "balance_loss_clip": 1.00159776, + "balance_loss_mlp": 1.00048757, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.8077671867216532, + "language_loss": 0.73648858, + "learning_rate": 3.383736971541766e-07, + "loss": 0.75867212, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 4.023203611373901 + }, + { + "auxiliary_loss_clip": 0.01115084, + "auxiliary_loss_mlp": 0.01104259, + "balance_loss_clip": 1.001652, + "balance_loss_mlp": 1.00049877, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 1.9515192808115527, + "language_loss": 0.68318707, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70538044, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.5921683311462402 + }, + { + "auxiliary_loss_clip": 0.0111649, + "auxiliary_loss_mlp": 0.0110314, + "balance_loss_clip": 1.00167751, + "balance_loss_mlp": 1.00033355, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.125374951306283, + "language_loss": 0.83473223, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85692853, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 2.574223279953003 + }, + { + "auxiliary_loss_clip": 0.01098538, + "auxiliary_loss_mlp": 0.01103679, + "balance_loss_clip": 1.00171554, + "balance_loss_mlp": 1.00049114, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.6575824726692707, + "language_loss": 0.68799019, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71001232, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 2.694373607635498 + }, + { + "auxiliary_loss_clip": 0.01129463, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.00059927, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 1.7275813747785764, + "language_loss": 0.73854733, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76087987, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 2.5716986656188965 + }, + { + "auxiliary_loss_clip": 0.01115594, + "auxiliary_loss_mlp": 0.01102799, + "balance_loss_clip": 1.00171995, + "balance_loss_mlp": 1.00066078, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.656348299533775, + "language_loss": 0.73931509, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76149905, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 2.589364767074585 + }, + { + "auxiliary_loss_clip": 0.01165063, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00056863, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.8587055192611455, + "language_loss": 0.6539315, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67661488, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.597015619277954 + }, + { + "auxiliary_loss_clip": 0.0113189, + "auxiliary_loss_mlp": 0.01103625, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00053287, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.7329901083698884, + "language_loss": 0.69935489, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72171003, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.6030828952789307 + }, + { + "auxiliary_loss_clip": 0.01150303, + "auxiliary_loss_mlp": 0.0110337, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00046873, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 2.099373634975913, + "language_loss": 0.79409754, + "learning_rate": 3.366416704613735e-07, + "loss": 0.8166343, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.6009163856506348 + }, + { + "auxiliary_loss_clip": 0.01114167, + "auxiliary_loss_mlp": 0.01079695, + "balance_loss_clip": 1.00129151, + "balance_loss_mlp": 0.99996835, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.7419763150802898, + "language_loss": 0.55852103, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.58045959, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.266871690750122 + }, + { + "auxiliary_loss_clip": 0.01084904, + "auxiliary_loss_mlp": 0.00747168, + "balance_loss_clip": 1.00164247, + "balance_loss_mlp": 1.00034666, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.9536434988129592, + "language_loss": 0.77721798, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79553866, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 2.7461659908294678 + }, + { + "auxiliary_loss_clip": 0.0111524, + "auxiliary_loss_mlp": 0.01104954, + "balance_loss_clip": 1.00147331, + "balance_loss_mlp": 1.00043118, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 2.3459033818487773, + "language_loss": 0.77296364, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79516554, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 2.820563554763794 + }, + { + "auxiliary_loss_clip": 0.01115841, + "auxiliary_loss_mlp": 0.01103265, + "balance_loss_clip": 1.00159657, + "balance_loss_mlp": 1.00045896, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 1.8843678679309526, + "language_loss": 0.86074889, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88293993, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.6106178760528564 + }, + { + "auxiliary_loss_clip": 0.0114846, + "auxiliary_loss_mlp": 0.01103849, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00066185, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.5651173109468348, + "language_loss": 0.72842354, + "learning_rate": 3.355612034397746e-07, + "loss": 0.75094664, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.576645612716675 + }, + { + "auxiliary_loss_clip": 0.01133763, + "auxiliary_loss_mlp": 0.01104275, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.00061083, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.4534957486092093, + "language_loss": 0.81038427, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83276474, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.6371922492980957 + }, + { + "auxiliary_loss_clip": 0.01150272, + "auxiliary_loss_mlp": 0.01103527, + "balance_loss_clip": 1.00172949, + "balance_loss_mlp": 1.00053024, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.7415156205772402, + "language_loss": 0.75635898, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77889699, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 2.564425230026245 + }, + { + "auxiliary_loss_clip": 0.01118605, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00157332, + "balance_loss_mlp": 1.00052738, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.8201074150716432, + "language_loss": 0.75685728, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77907002, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.6164019107818604 + }, + { + "auxiliary_loss_clip": 0.01131607, + "auxiliary_loss_mlp": 0.0110261, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00047112, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 11.146682145323958, + "language_loss": 0.68081284, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70315504, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 2.609931707382202 + }, + { + "auxiliary_loss_clip": 0.01132171, + "auxiliary_loss_mlp": 0.0110468, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00053847, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.6427227995564153, + "language_loss": 0.69923902, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72160757, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.6495449542999268 + }, + { + "auxiliary_loss_clip": 0.01082086, + "auxiliary_loss_mlp": 0.01103303, + "balance_loss_clip": 1.00166142, + "balance_loss_mlp": 1.00049651, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.7799353711276873, + "language_loss": 0.73778331, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.75963724, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.7014973163604736 + }, + { + "auxiliary_loss_clip": 0.01133353, + "auxiliary_loss_mlp": 0.00747397, + "balance_loss_clip": 1.0016768, + "balance_loss_mlp": 1.00040925, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.519531058482889, + "language_loss": 0.75906962, + "learning_rate": 3.340512006973011e-07, + "loss": 0.77787715, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.6097302436828613 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01103773, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00049019, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 1.8648212654476108, + "language_loss": 0.65159655, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67399216, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.6077446937561035 + }, + { + "auxiliary_loss_clip": 0.01165099, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00041461, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 2.012609560411237, + "language_loss": 0.74870884, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77139306, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 2.4955015182495117 + }, + { + "auxiliary_loss_clip": 0.01133924, + "auxiliary_loss_mlp": 0.01103282, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00057113, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 1.692578918195787, + "language_loss": 0.63043118, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.6528033, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 4.13242244720459 + }, + { + "auxiliary_loss_clip": 0.01165009, + "auxiliary_loss_mlp": 0.01102936, + "balance_loss_clip": 1.00186765, + "balance_loss_mlp": 1.0007025, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.8978775224094417, + "language_loss": 0.78184259, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.8045221, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 2.6117827892303467 + }, + { + "auxiliary_loss_clip": 0.01150548, + "auxiliary_loss_mlp": 0.00747598, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00043488, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 4.026272337931394, + "language_loss": 0.75661314, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77559459, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.5698912143707275 + }, + { + "auxiliary_loss_clip": 0.0114838, + "auxiliary_loss_mlp": 0.01102879, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.00054979, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.467590381023089, + "language_loss": 0.73444366, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75695622, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.5917251110076904 + }, + { + "auxiliary_loss_clip": 0.01165302, + "auxiliary_loss_mlp": 0.01103946, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00056732, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.666277725769528, + "language_loss": 0.68618357, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.70887601, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 2.4783618450164795 + }, + { + "auxiliary_loss_clip": 0.01133646, + "auxiliary_loss_mlp": 0.0110399, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00061154, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.5299503252459214, + "language_loss": 0.85455334, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87692976, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.543367862701416 + }, + { + "auxiliary_loss_clip": 0.01165113, + "auxiliary_loss_mlp": 0.01103268, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00046158, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 1.9283831074814441, + "language_loss": 0.73993075, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.76261461, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 2.4607808589935303 + }, + { + "auxiliary_loss_clip": 0.01150163, + "auxiliary_loss_mlp": 0.01104285, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00062084, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 1.9069632455985444, + "language_loss": 0.72137147, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74391592, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.504361867904663 + }, + { + "auxiliary_loss_clip": 0.01165069, + "auxiliary_loss_mlp": 0.01103019, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00059438, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5013362802197916, + "language_loss": 0.76567292, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.7883538, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.533008575439453 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.0110265, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00051188, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.640307111910375, + "language_loss": 0.65531528, + "learning_rate": 3.314698278332588e-07, + "loss": 0.6776967, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 2.6159510612487793 + }, + { + "auxiliary_loss_clip": 0.01150211, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.00055575, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.458622701997363, + "language_loss": 0.75745714, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77998996, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 2.5817980766296387 + }, + { + "auxiliary_loss_clip": 0.01087222, + "auxiliary_loss_mlp": 0.00747383, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00040269, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 2.9306706230694894, + "language_loss": 0.82028413, + "learning_rate": 3.310404844338841e-07, + "loss": 0.8386302, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 2.719721555709839 + }, + { + "auxiliary_loss_clip": 0.01150319, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.0005101, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.8677009548568972, + "language_loss": 0.75941193, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78195119, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.5772593021392822 + }, + { + "auxiliary_loss_clip": 0.01117876, + "auxiliary_loss_mlp": 0.01102542, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.00049877, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.072579363430598, + "language_loss": 0.81338698, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.8355912, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 3.9771668910980225 + }, + { + "auxiliary_loss_clip": 0.01148438, + "auxiliary_loss_mlp": 0.01103138, + "balance_loss_clip": 1.00196207, + "balance_loss_mlp": 1.00042725, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.1605412682991774, + "language_loss": 0.71020848, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73272419, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 5.5095226764678955 + }, + { + "auxiliary_loss_clip": 0.01102082, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_clip": 1.00157058, + "balance_loss_mlp": 1.00051475, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 1.7822586750250289, + "language_loss": 0.79400516, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81607062, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.6927783489227295 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01103087, + "balance_loss_clip": 1.00185287, + "balance_loss_mlp": 1.00047135, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.701573332403652, + "language_loss": 0.79006565, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81229192, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 2.661864757537842 + }, + { + "auxiliary_loss_clip": 0.01117072, + "auxiliary_loss_mlp": 0.01104675, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.00053382, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 1.8454729344202565, + "language_loss": 0.63030702, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65252447, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 2.755615711212158 + }, + { + "auxiliary_loss_clip": 0.01100521, + "auxiliary_loss_mlp": 0.01103521, + "balance_loss_clip": 1.00178361, + "balance_loss_mlp": 1.00052404, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.790189155958104, + "language_loss": 0.74017942, + "learning_rate": 3.295397765071055e-07, + "loss": 0.76221985, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.684544086456299 + }, + { + "auxiliary_loss_clip": 0.01131979, + "auxiliary_loss_mlp": 0.01103731, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00054383, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.5199425950941698, + "language_loss": 0.70342404, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72578114, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 2.647808074951172 + }, + { + "auxiliary_loss_clip": 0.01148714, + "auxiliary_loss_mlp": 0.01102772, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00053811, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.565135545522931, + "language_loss": 0.65542245, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67793727, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 2.56677508354187 + }, + { + "auxiliary_loss_clip": 0.01113193, + "auxiliary_loss_mlp": 0.0110447, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00080585, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.540419699109394, + "language_loss": 0.70811367, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.73029029, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.654632568359375 + }, + { + "auxiliary_loss_clip": 0.01131498, + "auxiliary_loss_mlp": 0.01102986, + "balance_loss_clip": 1.0016948, + "balance_loss_mlp": 1.00056136, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.7042032718820848, + "language_loss": 0.71528935, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73763424, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.5933611392974854 + }, + { + "auxiliary_loss_clip": 0.01135119, + "auxiliary_loss_mlp": 0.0110458, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00053406, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.1780819392928565, + "language_loss": 0.79066122, + "learning_rate": 3.284697424316132e-07, + "loss": 0.81305826, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.5768167972564697 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.0110325, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00063491, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 1.3441024996817146, + "language_loss": 0.68080819, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.7034902, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.546875 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.01103974, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.0004046, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.6307928118804456, + "language_loss": 0.80001843, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82239473, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.6266732215881348 + }, + { + "auxiliary_loss_clip": 0.01146097, + "auxiliary_loss_mlp": 0.01104299, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00053942, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.5879107467833806, + "language_loss": 0.68830556, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71080953, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 2.5702338218688965 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01104385, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00053024, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.2341397486196066, + "language_loss": 0.60789001, + "learning_rate": 3.276148560452001e-07, + "loss": 0.63027173, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 2.5389630794525146 + }, + { + "auxiliary_loss_clip": 0.01119154, + "auxiliary_loss_mlp": 0.00747435, + "balance_loss_clip": 1.00190032, + "balance_loss_mlp": 1.00044012, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 2.0192959942349393, + "language_loss": 0.71857595, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.73724186, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.63047194480896 + }, + { + "auxiliary_loss_clip": 0.01131636, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_clip": 1.00174248, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 1.951152308442276, + "language_loss": 0.7320655, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75440484, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.5528616905212402 + }, + { + "auxiliary_loss_clip": 0.01102385, + "auxiliary_loss_mlp": 0.01105206, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00049222, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 2.2113907719714776, + "language_loss": 0.6304009, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65247679, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.808159351348877 + }, + { + "auxiliary_loss_clip": 0.01120442, + "auxiliary_loss_mlp": 0.01103124, + "balance_loss_clip": 1.0016048, + "balance_loss_mlp": 1.00050902, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.4644811852592896, + "language_loss": 0.69880724, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72104287, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.643583059310913 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.0016619, + "balance_loss_mlp": 1.00057495, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.85600629207998, + "language_loss": 0.81937706, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84172142, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.5764670372009277 + }, + { + "auxiliary_loss_clip": 0.01133313, + "auxiliary_loss_mlp": 0.01102598, + "balance_loss_clip": 1.00167763, + "balance_loss_mlp": 1.00055468, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.0487672418008853, + "language_loss": 0.73987448, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76223361, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 2.528792142868042 + }, + { + "auxiliary_loss_clip": 0.01119053, + "auxiliary_loss_mlp": 0.01102476, + "balance_loss_clip": 1.0017637, + "balance_loss_mlp": 1.00052786, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.6890610154603383, + "language_loss": 0.55429667, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57651198, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.71443510055542 + }, + { + "auxiliary_loss_clip": 0.01082777, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_clip": 1.00165296, + "balance_loss_mlp": 1.00046003, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.4471814513085386, + "language_loss": 0.79030824, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81216764, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.666452169418335 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01102275, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00051749, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.52073805365049, + "language_loss": 0.59981787, + "learning_rate": 3.256950723599887e-07, + "loss": 0.6223228, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 2.702232837677002 + }, + { + "auxiliary_loss_clip": 0.01150502, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.00190866, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 1.9619085547151685, + "language_loss": 0.72770172, + "learning_rate": 3.254820804029075e-07, + "loss": 0.75024539, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 3.9436354637145996 + }, + { + "auxiliary_loss_clip": 0.0114599, + "auxiliary_loss_mlp": 0.01104437, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00058198, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 3.3963593866078226, + "language_loss": 0.74676454, + "learning_rate": 3.252691519437143e-07, + "loss": 0.76926881, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.5192642211914062 + }, + { + "auxiliary_loss_clip": 0.01160663, + "auxiliary_loss_mlp": 0.01080064, + "balance_loss_clip": 1.00122285, + "balance_loss_mlp": 0.99995512, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7432223645794843, + "language_loss": 0.54044956, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56285685, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 3.2185938358306885 + }, + { + "auxiliary_loss_clip": 0.01104153, + "auxiliary_loss_mlp": 0.01103775, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.0004921, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 1.9329418102380718, + "language_loss": 0.65704489, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67912418, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.6200788021087646 + }, + { + "auxiliary_loss_clip": 0.01133348, + "auxiliary_loss_mlp": 0.01102619, + "balance_loss_clip": 1.00181639, + "balance_loss_mlp": 1.00048089, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.4734914407463218, + "language_loss": 0.75021845, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77257812, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 2.627295970916748 + }, + { + "auxiliary_loss_clip": 0.01148772, + "auxiliary_loss_mlp": 0.00747463, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00041509, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 1.9528057867605195, + "language_loss": 0.65256619, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67152858, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 2.660179376602173 + }, + { + "auxiliary_loss_clip": 0.01098116, + "auxiliary_loss_mlp": 0.01103062, + "balance_loss_clip": 1.00150287, + "balance_loss_mlp": 1.00044656, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.796776920027439, + "language_loss": 0.77050781, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79251951, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 2.6680331230163574 + }, + { + "auxiliary_loss_clip": 0.0111493, + "auxiliary_loss_mlp": 0.01103191, + "balance_loss_clip": 1.00167835, + "balance_loss_mlp": 1.00057566, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.8311079270650128, + "language_loss": 0.77535182, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79753298, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.584498643875122 + }, + { + "auxiliary_loss_clip": 0.01098165, + "auxiliary_loss_mlp": 0.01103503, + "balance_loss_clip": 1.00165713, + "balance_loss_mlp": 1.00050616, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.0364338599173797, + "language_loss": 0.73174298, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75375968, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.6559059619903564 + }, + { + "auxiliary_loss_clip": 0.01148434, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_clip": 1.00180829, + "balance_loss_mlp": 1.0004369, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.658065474142699, + "language_loss": 0.78810263, + "learning_rate": 3.235680111625161e-07, + "loss": 0.8106128, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 2.519430637359619 + }, + { + "auxiliary_loss_clip": 0.01148171, + "auxiliary_loss_mlp": 0.01104999, + "balance_loss_clip": 1.00191319, + "balance_loss_mlp": 1.00066733, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 2.1759692043842325, + "language_loss": 0.74797976, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77051139, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.5768532752990723 + }, + { + "auxiliary_loss_clip": 0.01148609, + "auxiliary_loss_mlp": 0.01105043, + "balance_loss_clip": 1.00184691, + "balance_loss_mlp": 1.00061548, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 4.568077675412054, + "language_loss": 0.76458669, + "learning_rate": 3.23143361510728e-07, + "loss": 0.7871232, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 2.542968273162842 + }, + { + "auxiliary_loss_clip": 0.01100345, + "auxiliary_loss_mlp": 0.01103598, + "balance_loss_clip": 1.00141525, + "balance_loss_mlp": 1.0004108, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 1.9781596701609128, + "language_loss": 0.74541909, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76745844, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 2.6089212894439697 + }, + { + "auxiliary_loss_clip": 0.01132223, + "auxiliary_loss_mlp": 0.01103401, + "balance_loss_clip": 1.00177968, + "balance_loss_mlp": 1.00059509, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.9219723900362338, + "language_loss": 0.79681396, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81917024, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 3.9794068336486816 + }, + { + "auxiliary_loss_clip": 0.01133593, + "auxiliary_loss_mlp": 0.01103332, + "balance_loss_clip": 1.00180972, + "balance_loss_mlp": 1.00062132, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 1.7064916399880625, + "language_loss": 0.6986897, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72105896, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 4.065158128738403 + }, + { + "auxiliary_loss_clip": 0.01150596, + "auxiliary_loss_mlp": 0.0110288, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00064611, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.6145980483009912, + "language_loss": 0.74276412, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.7652989, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 4.012284994125366 + }, + { + "auxiliary_loss_clip": 0.01131992, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00058126, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 3.448658344156307, + "language_loss": 0.80286467, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82521659, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 2.566488742828369 + }, + { + "auxiliary_loss_clip": 0.01150742, + "auxiliary_loss_mlp": 0.01104378, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00071335, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.7604400037847103, + "language_loss": 0.69843447, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72098565, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.512303352355957 + }, + { + "auxiliary_loss_clip": 0.01164928, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00036478, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.5980867327626642, + "language_loss": 0.71817821, + "learning_rate": 3.216590911288133e-07, + "loss": 0.74085826, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 2.570647954940796 + }, + { + "auxiliary_loss_clip": 0.0113319, + "auxiliary_loss_mlp": 0.01102984, + "balance_loss_clip": 1.0017091, + "balance_loss_mlp": 1.00036907, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9059705266038824, + "language_loss": 0.69628274, + "learning_rate": 3.214473070099564e-07, + "loss": 0.71864444, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.561427116394043 + }, + { + "auxiliary_loss_clip": 0.01116075, + "auxiliary_loss_mlp": 0.01104274, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.0006094, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.6095686855513507, + "language_loss": 0.59704828, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61925179, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 2.6452853679656982 + }, + { + "auxiliary_loss_clip": 0.01134015, + "auxiliary_loss_mlp": 0.01104155, + "balance_loss_clip": 1.00176919, + "balance_loss_mlp": 1.0006814, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.5638824157506583, + "language_loss": 0.6961875, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71856916, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 2.733242988586426 + }, + { + "auxiliary_loss_clip": 0.01165099, + "auxiliary_loss_mlp": 0.01104547, + "balance_loss_clip": 1.00186169, + "balance_loss_mlp": 1.00059605, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.7246408216816966, + "language_loss": 0.79195857, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81465507, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 2.5071604251861572 + }, + { + "auxiliary_loss_clip": 0.0116505, + "auxiliary_loss_mlp": 0.01102507, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00055885, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.8563609358717934, + "language_loss": 0.863433, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88610852, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.521329641342163 + }, + { + "auxiliary_loss_clip": 0.01164921, + "auxiliary_loss_mlp": 0.01101995, + "balance_loss_clip": 1.00186265, + "balance_loss_mlp": 1.00052452, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.8804994231690961, + "language_loss": 0.79839545, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82106459, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.544051170349121 + }, + { + "auxiliary_loss_clip": 0.01117285, + "auxiliary_loss_mlp": 0.01103139, + "balance_loss_clip": 1.00167155, + "balance_loss_mlp": 1.00052369, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.6964230571655357, + "language_loss": 0.68988299, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.71208727, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.6050286293029785 + }, + { + "auxiliary_loss_clip": 0.0111504, + "auxiliary_loss_mlp": 0.01103471, + "balance_loss_clip": 1.00148213, + "balance_loss_mlp": 1.00056922, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 2.0421758756812296, + "language_loss": 0.78352153, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80570662, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.6048154830932617 + }, + { + "auxiliary_loss_clip": 0.01148452, + "auxiliary_loss_mlp": 0.01103261, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00045443, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 2.5401051450388596, + "language_loss": 0.726807, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.74932408, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.637781858444214 + }, + { + "auxiliary_loss_clip": 0.01165109, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00030303, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.601851044650502, + "language_loss": 0.7321974, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75132167, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.545590877532959 + }, + { + "auxiliary_loss_clip": 0.0115052, + "auxiliary_loss_mlp": 0.01103045, + "balance_loss_clip": 1.00181258, + "balance_loss_mlp": 1.00052524, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 1.725843383397814, + "language_loss": 0.68794584, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71048146, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.5456626415252686 + }, + { + "auxiliary_loss_clip": 0.01104871, + "auxiliary_loss_mlp": 0.0110361, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.00051761, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.5971939401626574, + "language_loss": 0.85490578, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87699056, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 2.6628663539886475 + }, + { + "auxiliary_loss_clip": 0.01150431, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00048625, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.8178290608588676, + "language_loss": 0.7732017, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79573989, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 2.5434131622314453 + }, + { + "auxiliary_loss_clip": 0.01148455, + "auxiliary_loss_mlp": 0.01103162, + "balance_loss_clip": 1.00185418, + "balance_loss_mlp": 1.00045085, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.563603109028031, + "language_loss": 0.71560937, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73812556, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.547708749771118 + }, + { + "auxiliary_loss_clip": 0.01118946, + "auxiliary_loss_mlp": 0.0110273, + "balance_loss_clip": 1.00169861, + "balance_loss_mlp": 1.00068712, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3301869787667602, + "language_loss": 0.83725142, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85946822, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.662055253982544 + }, + { + "auxiliary_loss_clip": 0.01083186, + "auxiliary_loss_mlp": 0.01103613, + "balance_loss_clip": 1.00141907, + "balance_loss_mlp": 1.0004251, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.6662202679736244, + "language_loss": 0.77055383, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79242182, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.702894449234009 + }, + { + "auxiliary_loss_clip": 0.01133171, + "auxiliary_loss_mlp": 0.01103019, + "balance_loss_clip": 1.00180042, + "balance_loss_mlp": 1.00049877, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.8673089366581208, + "language_loss": 0.81065935, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83302128, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.574192523956299 + }, + { + "auxiliary_loss_clip": 0.01144071, + "auxiliary_loss_mlp": 0.01080079, + "balance_loss_clip": 1.00116038, + "balance_loss_mlp": 0.99997026, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7405311862408186, + "language_loss": 0.63856584, + "learning_rate": 3.178567221188393e-07, + "loss": 0.66080731, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 3.2045984268188477 + }, + { + "auxiliary_loss_clip": 0.01114784, + "auxiliary_loss_mlp": 0.01102052, + "balance_loss_clip": 1.00171018, + "balance_loss_mlp": 1.00029445, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.8225896328224722, + "language_loss": 0.7302556, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.752424, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 2.595224618911743 + }, + { + "auxiliary_loss_clip": 0.01120942, + "auxiliary_loss_mlp": 0.01103889, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00051069, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 1.8472783454757036, + "language_loss": 0.71968782, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74193621, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 4.023609161376953 + }, + { + "auxiliary_loss_clip": 0.01133478, + "auxiliary_loss_mlp": 0.01102536, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00049269, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 1.9668488702126745, + "language_loss": 0.81780607, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84016621, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.5758743286132812 + }, + { + "auxiliary_loss_clip": 0.01131553, + "auxiliary_loss_mlp": 0.01103363, + "balance_loss_clip": 1.00167489, + "balance_loss_mlp": 1.00055718, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.5873397499217237, + "language_loss": 0.73055804, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75290716, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.6013617515563965 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.0110359, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.00049806, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.62961009313922, + "language_loss": 0.69241846, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71493804, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.553938627243042 + }, + { + "auxiliary_loss_clip": 0.01100535, + "auxiliary_loss_mlp": 0.01102807, + "balance_loss_clip": 1.00170875, + "balance_loss_mlp": 1.00047803, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.7137604578024832, + "language_loss": 0.74588835, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.76792181, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.6633291244506836 + }, + { + "auxiliary_loss_clip": 0.01165301, + "auxiliary_loss_mlp": 0.01105017, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00068533, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.8412099502593071, + "language_loss": 0.69517374, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.71787697, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 2.5262396335601807 + }, + { + "auxiliary_loss_clip": 0.01165056, + "auxiliary_loss_mlp": 0.01102872, + "balance_loss_clip": 1.00189602, + "balance_loss_mlp": 1.00054264, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.7759242251003629, + "language_loss": 0.64256704, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66524637, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 2.5540268421173096 + }, + { + "auxiliary_loss_clip": 0.01165158, + "auxiliary_loss_mlp": 0.01103786, + "balance_loss_clip": 1.0018543, + "balance_loss_mlp": 1.00050259, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 1.7187702139540018, + "language_loss": 0.69075286, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71344233, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.497048854827881 + }, + { + "auxiliary_loss_clip": 0.01134049, + "auxiliary_loss_mlp": 0.01104197, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00053215, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.7760264467872695, + "language_loss": 0.69414186, + "learning_rate": 3.157532220876475e-07, + "loss": 0.7165243, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 2.5470619201660156 + }, + { + "auxiliary_loss_clip": 0.0111796, + "auxiliary_loss_mlp": 0.01103763, + "balance_loss_clip": 1.00167656, + "balance_loss_mlp": 1.0004797, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 2.251252941841369, + "language_loss": 0.78819865, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81041586, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.6449215412139893 + }, + { + "auxiliary_loss_clip": 0.01150734, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00044799, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 3.7139787800304926, + "language_loss": 0.68878806, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.71132982, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.5150959491729736 + }, + { + "auxiliary_loss_clip": 0.01104513, + "auxiliary_loss_mlp": 0.01103303, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.00049651, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 2.364509170025333, + "language_loss": 0.82152939, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84360754, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 2.6993720531463623 + }, + { + "auxiliary_loss_clip": 0.01147869, + "auxiliary_loss_mlp": 0.0110303, + "balance_loss_clip": 1.00170898, + "balance_loss_mlp": 1.00050974, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 3.1126795721727714, + "language_loss": 0.78503489, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80754381, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 2.538621187210083 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01103654, + "balance_loss_clip": 1.0016973, + "balance_loss_mlp": 1.00046623, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 1.6713169038225866, + "language_loss": 0.65539843, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67762637, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 5.474136829376221 + }, + { + "auxiliary_loss_clip": 0.0114579, + "auxiliary_loss_mlp": 0.01102846, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.0006125, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 2.7420780219699177, + "language_loss": 0.7422232, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76470959, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 2.574159860610962 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.0110344, + "balance_loss_clip": 1.00176156, + "balance_loss_mlp": 1.00034785, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 1.6943498040790481, + "language_loss": 0.81333435, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83585286, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 3.9007315635681152 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.0110309, + "balance_loss_clip": 1.0019275, + "balance_loss_mlp": 1.00057006, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.678039738797145, + "language_loss": 0.66491306, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68742907, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 2.6468167304992676 + }, + { + "auxiliary_loss_clip": 0.01118086, + "auxiliary_loss_mlp": 0.01103401, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.00049949, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 1.8929585517785945, + "language_loss": 0.74817914, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77039397, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 2.61741304397583 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01079692, + "balance_loss_clip": 1.00127089, + "balance_loss_mlp": 0.99996519, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7304534362613266, + "language_loss": 0.58990437, + "learning_rate": 3.136561087351175e-07, + "loss": 0.6117875, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 3.3045432567596436 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.0074721, + "balance_loss_clip": 1.00195277, + "balance_loss_mlp": 1.00030303, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 2.3687137668750524, + "language_loss": 0.7959348, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.8149116, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.5170950889587402 + }, + { + "auxiliary_loss_clip": 0.01145921, + "auxiliary_loss_mlp": 0.01102574, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00053048, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.8457667064319512, + "language_loss": 0.69007981, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71256471, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 2.497349262237549 + }, + { + "auxiliary_loss_clip": 0.01133612, + "auxiliary_loss_mlp": 0.0110395, + "balance_loss_clip": 1.00158381, + "balance_loss_mlp": 1.00047588, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.8654019305378067, + "language_loss": 0.69322824, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71560395, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 2.534867763519287 + }, + { + "auxiliary_loss_clip": 0.01064003, + "auxiliary_loss_mlp": 0.01103839, + "balance_loss_clip": 1.00172377, + "balance_loss_mlp": 1.00046086, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.5385947884578084, + "language_loss": 0.75952828, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.78120673, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 2.9453787803649902 + }, + { + "auxiliary_loss_clip": 0.01086549, + "auxiliary_loss_mlp": 0.01102599, + "balance_loss_clip": 1.00160217, + "balance_loss_mlp": 1.00026977, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.834432758310817, + "language_loss": 0.78033686, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80222833, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 3.031963586807251 + }, + { + "auxiliary_loss_clip": 0.01165047, + "auxiliary_loss_mlp": 0.01102347, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00049496, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.6193169659842086, + "language_loss": 0.6239801, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64665401, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.558185577392578 + }, + { + "auxiliary_loss_clip": 0.01165117, + "auxiliary_loss_mlp": 0.01103245, + "balance_loss_clip": 1.0018568, + "balance_loss_mlp": 1.00062919, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.4076344742936626, + "language_loss": 0.74193227, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76461589, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.51017689704895 + }, + { + "auxiliary_loss_clip": 0.01117253, + "auxiliary_loss_mlp": 0.01104086, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00061202, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.8694001726309324, + "language_loss": 0.63917983, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66139328, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.674682855606079 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_clip": 1.00187957, + "balance_loss_mlp": 1.00046051, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.5750495077608506, + "language_loss": 0.82039845, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84276688, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.600994348526001 + }, + { + "auxiliary_loss_clip": 0.01150416, + "auxiliary_loss_mlp": 0.01102761, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00052702, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.8872977165183054, + "language_loss": 0.7044819, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72701371, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.616225242614746 + }, + { + "auxiliary_loss_clip": 0.01150087, + "auxiliary_loss_mlp": 0.01103989, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00061107, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 1.979442273900549, + "language_loss": 0.62783551, + "learning_rate": 3.113566701515036e-07, + "loss": 0.65037632, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 2.5531723499298096 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01104316, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00065148, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.7347453729182756, + "language_loss": 0.71116042, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73354107, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 2.6230525970458984 + }, + { + "auxiliary_loss_clip": 0.01130235, + "auxiliary_loss_mlp": 0.01080502, + "balance_loss_clip": 1.00121069, + "balance_loss_mlp": 1.0000124, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.9085234912319401, + "language_loss": 0.62684619, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64895356, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 2.9994146823883057 + }, + { + "auxiliary_loss_clip": 0.01099807, + "auxiliary_loss_mlp": 0.01102975, + "balance_loss_clip": 1.00165915, + "balance_loss_mlp": 1.0006454, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.1087804646639157, + "language_loss": 0.631001, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65302885, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.8675615787506104 + }, + { + "auxiliary_loss_clip": 0.01120887, + "auxiliary_loss_mlp": 0.00747402, + "balance_loss_clip": 1.00170982, + "balance_loss_mlp": 1.00035095, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 1.9559512295758623, + "language_loss": 0.69905865, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71774149, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 2.6149280071258545 + }, + { + "auxiliary_loss_clip": 0.01148543, + "auxiliary_loss_mlp": 0.01104065, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00059164, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.1825833117371203, + "language_loss": 0.70993364, + "learning_rate": 3.103140315024817e-07, + "loss": 0.73245972, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.4922733306884766 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.01102764, + "balance_loss_clip": 1.00181603, + "balance_loss_mlp": 1.00043476, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.407545451624987, + "language_loss": 0.82534266, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84802032, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.542018175125122 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01102543, + "balance_loss_clip": 1.00181496, + "balance_loss_mlp": 1.00050032, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.7250163398571785, + "language_loss": 0.83206916, + "learning_rate": 3.098974244989676e-07, + "loss": 0.85443211, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.638942003250122 + }, + { + "auxiliary_loss_clip": 0.01150359, + "auxiliary_loss_mlp": 0.01102915, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00058603, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.790089341356367, + "language_loss": 0.70656848, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72910118, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 4.001631498336792 + }, + { + "auxiliary_loss_clip": 0.01144612, + "auxiliary_loss_mlp": 0.01080108, + "balance_loss_clip": 1.00123453, + "balance_loss_mlp": 0.99999905, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8570149021150758, + "language_loss": 0.67951369, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70176095, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.117440938949585 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01104376, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00052142, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 1.8881319181371885, + "language_loss": 0.69518507, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71756637, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.5647804737091064 + }, + { + "auxiliary_loss_clip": 0.01133378, + "auxiliary_loss_mlp": 0.01105588, + "balance_loss_clip": 1.00177705, + "balance_loss_mlp": 1.00049317, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 1.819536324765494, + "language_loss": 0.63203549, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65442514, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.715085983276367 + }, + { + "auxiliary_loss_clip": 0.01129223, + "auxiliary_loss_mlp": 0.01080121, + "balance_loss_clip": 1.0010643, + "balance_loss_mlp": 1.00001264, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8282549495735325, + "language_loss": 0.59368432, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61577785, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.1721444129943848 + }, + { + "auxiliary_loss_clip": 0.01165328, + "auxiliary_loss_mlp": 0.01104778, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00054097, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.808666646804511, + "language_loss": 0.75365639, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77635747, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 2.5053982734680176 + }, + { + "auxiliary_loss_clip": 0.0114841, + "auxiliary_loss_mlp": 0.01103178, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00056231, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.7582826886438152, + "language_loss": 0.6227237, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64523959, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.628472089767456 + }, + { + "auxiliary_loss_clip": 0.01102767, + "auxiliary_loss_mlp": 0.01104993, + "balance_loss_clip": 1.00156021, + "balance_loss_mlp": 1.00047004, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 3.0141737517623093, + "language_loss": 0.66151857, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68359619, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 2.6183829307556152 + }, + { + "auxiliary_loss_clip": 0.01135445, + "auxiliary_loss_mlp": 0.01103824, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00044608, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.9036627218199935, + "language_loss": 0.66429383, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.68668652, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 2.572538137435913 + }, + { + "auxiliary_loss_clip": 0.01116475, + "auxiliary_loss_mlp": 0.01103552, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00055575, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.6055272855421623, + "language_loss": 0.75246644, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77466667, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 2.6264874935150146 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.00747396, + "balance_loss_clip": 1.00181186, + "balance_loss_mlp": 1.00042272, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.7827143603559445, + "language_loss": 0.78759426, + "learning_rate": 3.076106700253709e-07, + "loss": 0.80640048, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 2.566053867340088 + }, + { + "auxiliary_loss_clip": 0.01148522, + "auxiliary_loss_mlp": 0.0110459, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00063932, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 1.8156435873297518, + "language_loss": 0.68479824, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70732939, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.5262808799743652 + }, + { + "auxiliary_loss_clip": 0.01150197, + "auxiliary_loss_mlp": 0.01103529, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00043726, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 1.8444155366221644, + "language_loss": 0.75488353, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77742076, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.521657705307007 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.01102972, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00064254, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 1.658318558608893, + "language_loss": 0.63699794, + "learning_rate": 3.069883569603102e-07, + "loss": 0.6591754, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 4.321954965591431 + }, + { + "auxiliary_loss_clip": 0.01133523, + "auxiliary_loss_mlp": 0.01103257, + "balance_loss_clip": 1.00168061, + "balance_loss_mlp": 1.00045061, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.548332367874417, + "language_loss": 0.73428649, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75665426, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 4.252253770828247 + }, + { + "auxiliary_loss_clip": 0.01148601, + "auxiliary_loss_mlp": 0.01103702, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00061035, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.9045240968252612, + "language_loss": 0.66082388, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68334687, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 2.6262192726135254 + }, + { + "auxiliary_loss_clip": 0.01133626, + "auxiliary_loss_mlp": 0.01103546, + "balance_loss_clip": 1.00175679, + "balance_loss_mlp": 1.00054884, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.6740535334242224, + "language_loss": 0.60855287, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.63092458, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 4.17671799659729 + }, + { + "auxiliary_loss_clip": 0.01143924, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_clip": 1.00124335, + "balance_loss_mlp": 0.99998915, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7735448715401299, + "language_loss": 0.57382154, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59606183, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.213867425918579 + }, + { + "auxiliary_loss_clip": 0.01093734, + "auxiliary_loss_mlp": 0.00746567, + "balance_loss_clip": 1.00113058, + "balance_loss_mlp": 1.00117517, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.7038509144821529, + "language_loss": 0.54952848, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56793147, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 3.320927381515503 + }, + { + "auxiliary_loss_clip": 0.01118624, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00057054, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 2.0014281974673302, + "language_loss": 0.69482774, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71704113, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 2.63897967338562 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01102037, + "balance_loss_clip": 1.00167656, + "balance_loss_mlp": 1.00056648, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 1.8912668212419084, + "language_loss": 0.69868493, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72070754, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 2.640455484390259 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.01103552, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00065029, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.6755588381558044, + "language_loss": 0.72179359, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74431628, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 2.5488929748535156 + }, + { + "auxiliary_loss_clip": 0.01148659, + "auxiliary_loss_mlp": 0.01104579, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00053287, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.0139217771268028, + "language_loss": 0.69050384, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71303618, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 2.5056726932525635 + }, + { + "auxiliary_loss_clip": 0.01133046, + "auxiliary_loss_mlp": 0.01102337, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00048459, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.6435029935980607, + "language_loss": 0.69547039, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.71782422, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 2.6069440841674805 + }, + { + "auxiliary_loss_clip": 0.01132847, + "auxiliary_loss_mlp": 0.01103373, + "balance_loss_clip": 1.00171614, + "balance_loss_mlp": 1.00047207, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.6024957365252166, + "language_loss": 0.7120856, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73444784, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 2.570460557937622 + }, + { + "auxiliary_loss_clip": 0.0111919, + "auxiliary_loss_mlp": 0.01103382, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00057602, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.8617086665041542, + "language_loss": 0.77124357, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79346937, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.6398942470550537 + }, + { + "auxiliary_loss_clip": 0.01115947, + "auxiliary_loss_mlp": 0.01101882, + "balance_loss_clip": 1.00168276, + "balance_loss_mlp": 1.00050604, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.845777081930477, + "language_loss": 0.69622052, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71839881, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 2.6051206588745117 + }, + { + "auxiliary_loss_clip": 0.01100539, + "auxiliary_loss_mlp": 0.01102455, + "balance_loss_clip": 1.0016036, + "balance_loss_mlp": 1.00050688, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.7076847029821907, + "language_loss": 0.69990909, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72193909, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.6346068382263184 + }, + { + "auxiliary_loss_clip": 0.01116271, + "auxiliary_loss_mlp": 0.01080476, + "balance_loss_clip": 1.00113153, + "balance_loss_mlp": 0.99998617, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8420320297044059, + "language_loss": 0.65143847, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67340589, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.250422239303589 + }, + { + "auxiliary_loss_clip": 0.01148416, + "auxiliary_loss_mlp": 0.01104096, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00062227, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 1.8655577758286197, + "language_loss": 0.77806413, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.8005892, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.520122528076172 + }, + { + "auxiliary_loss_clip": 0.01085582, + "auxiliary_loss_mlp": 0.01103649, + "balance_loss_clip": 1.00162363, + "balance_loss_mlp": 1.0003655, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.5913669874655059, + "language_loss": 0.62329626, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64518857, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.7697441577911377 + }, + { + "auxiliary_loss_clip": 0.01117066, + "auxiliary_loss_mlp": 0.01102834, + "balance_loss_clip": 1.0017854, + "balance_loss_mlp": 1.00050497, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.5914537413428438, + "language_loss": 0.82512844, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84732741, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.6242830753326416 + }, + { + "auxiliary_loss_clip": 0.01132208, + "auxiliary_loss_mlp": 0.01104601, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00065088, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 8.428160178142573, + "language_loss": 0.69128054, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71364862, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.621717929840088 + }, + { + "auxiliary_loss_clip": 0.01085351, + "auxiliary_loss_mlp": 0.01102098, + "balance_loss_clip": 1.00160503, + "balance_loss_mlp": 1.00053144, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 3.2267202993214825, + "language_loss": 0.74104786, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76292229, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.731158494949341 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.01103183, + "balance_loss_clip": 1.00168681, + "balance_loss_mlp": 1.00047219, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 1.7473601313744997, + "language_loss": 0.74355215, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76590008, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 2.706749200820923 + }, + { + "auxiliary_loss_clip": 0.01116906, + "auxiliary_loss_mlp": 0.01103048, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.00043285, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.5882810121277289, + "language_loss": 0.75844681, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.78064632, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.619940996170044 + }, + { + "auxiliary_loss_clip": 0.01165037, + "auxiliary_loss_mlp": 0.01103315, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.0007, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.9854193169906966, + "language_loss": 0.72162867, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74431223, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.6209864616394043 + }, + { + "auxiliary_loss_clip": 0.01132727, + "auxiliary_loss_mlp": 0.0110267, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.00053191, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.527966203825235, + "language_loss": 0.74713933, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76949328, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 2.622340202331543 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01103355, + "balance_loss_clip": 1.00177729, + "balance_loss_mlp": 1.00064445, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.6612703075145738, + "language_loss": 0.75784868, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77988613, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 4.1324920654296875 + }, + { + "auxiliary_loss_clip": 0.01115129, + "auxiliary_loss_mlp": 0.00747279, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.00029135, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.6915447153324032, + "language_loss": 0.75176859, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77039272, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 2.6233363151550293 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01104428, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00047719, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 1.9144873566437945, + "language_loss": 0.73633951, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75903648, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.553114891052246 + }, + { + "auxiliary_loss_clip": 0.01103799, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_clip": 1.00165296, + "balance_loss_mlp": 1.00044811, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 5.571950417319724, + "language_loss": 0.77479613, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79686761, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.616475820541382 + }, + { + "auxiliary_loss_clip": 0.0114831, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00044525, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.5516199553418866, + "language_loss": 0.82558036, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84809214, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 2.5550575256347656 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.01102618, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00038445, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8008795746635815, + "language_loss": 0.74511516, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76764387, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.5071308612823486 + }, + { + "auxiliary_loss_clip": 0.01114999, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_clip": 1.00120282, + "balance_loss_mlp": 0.99994725, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7978825008143261, + "language_loss": 0.56692994, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58888429, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.195941925048828 + }, + { + "auxiliary_loss_clip": 0.01117133, + "auxiliary_loss_mlp": 0.01103599, + "balance_loss_clip": 1.00173008, + "balance_loss_mlp": 1.00041115, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.8532815796335866, + "language_loss": 0.79738039, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.81958771, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.6501951217651367 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_clip": 1.0017879, + "balance_loss_mlp": 1.00058711, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 1.8755960450868363, + "language_loss": 0.75135332, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77342016, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.65887451171875 + }, + { + "auxiliary_loss_clip": 0.01148294, + "auxiliary_loss_mlp": 0.01102742, + "balance_loss_clip": 1.00162327, + "balance_loss_mlp": 1.00041258, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.6835944284674178, + "language_loss": 0.75956893, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78207934, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.5695769786834717 + }, + { + "auxiliary_loss_clip": 0.01165026, + "auxiliary_loss_mlp": 0.01103797, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00041842, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.6636432845485947, + "language_loss": 0.73440909, + "learning_rate": 2.997707859351304e-07, + "loss": 0.7570973, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 2.5003104209899902 + }, + { + "auxiliary_loss_clip": 0.01148409, + "auxiliary_loss_mlp": 0.01104357, + "balance_loss_clip": 1.00165498, + "balance_loss_mlp": 1.00059748, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.5601759766252254, + "language_loss": 0.69895619, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72148383, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 2.632831573486328 + }, + { + "auxiliary_loss_clip": 0.01131835, + "auxiliary_loss_mlp": 0.01103325, + "balance_loss_clip": 1.00164795, + "balance_loss_mlp": 1.00051856, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.4007417474971786, + "language_loss": 0.6833564, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70570803, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.6602702140808105 + }, + { + "auxiliary_loss_clip": 0.01099512, + "auxiliary_loss_mlp": 0.0110391, + "balance_loss_clip": 1.00158441, + "balance_loss_mlp": 1.0006268, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.6002362035057074, + "language_loss": 0.76905704, + "learning_rate": 2.991558072017426e-07, + "loss": 0.7910912, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 5.556048631668091 + }, + { + "auxiliary_loss_clip": 0.01145652, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00060415, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 2.39785826942617, + "language_loss": 0.80313826, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82562697, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 2.5213096141815186 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.01102558, + "balance_loss_clip": 1.00178254, + "balance_loss_mlp": 1.00060987, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.4736090058744737, + "language_loss": 0.7112695, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73360956, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 4.123301029205322 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_clip": 1.00151932, + "balance_loss_mlp": 1.00046027, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 1.6922048774052263, + "language_loss": 0.68089902, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7031188, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 2.7567052841186523 + }, + { + "auxiliary_loss_clip": 0.01148259, + "auxiliary_loss_mlp": 0.01103177, + "balance_loss_clip": 1.00173283, + "balance_loss_mlp": 1.00046623, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 4.424872893590281, + "language_loss": 0.7740038, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79651821, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 2.549165725708008 + }, + { + "auxiliary_loss_clip": 0.01133726, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00044107, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.4671015131288452, + "language_loss": 0.70000178, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72237343, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 2.593022108078003 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.0110381, + "balance_loss_clip": 1.00178528, + "balance_loss_mlp": 1.00052714, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.4859992451800648, + "language_loss": 0.65179813, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67415285, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 2.6263537406921387 + }, + { + "auxiliary_loss_clip": 0.01086019, + "auxiliary_loss_mlp": 0.01103096, + "balance_loss_clip": 1.00157547, + "balance_loss_mlp": 1.00048101, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.7893513273636246, + "language_loss": 0.66611499, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68800616, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.701869010925293 + }, + { + "auxiliary_loss_clip": 0.01165083, + "auxiliary_loss_mlp": 0.01103748, + "balance_loss_clip": 1.00177026, + "balance_loss_mlp": 1.00065601, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.8446908312711996, + "language_loss": 0.6643489, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68703723, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 2.5256948471069336 + }, + { + "auxiliary_loss_clip": 0.01052994, + "auxiliary_loss_mlp": 0.01102491, + "balance_loss_clip": 1.00157082, + "balance_loss_mlp": 1.00044763, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.6360052981677295, + "language_loss": 0.66446126, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68601614, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 2.869091272354126 + }, + { + "auxiliary_loss_clip": 0.01104003, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_clip": 1.00164354, + "balance_loss_mlp": 1.0005914, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.7398790396489094, + "language_loss": 0.72066867, + "learning_rate": 2.971100715196666e-07, + "loss": 0.74273503, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 2.6891438961029053 + }, + { + "auxiliary_loss_clip": 0.01068841, + "auxiliary_loss_mlp": 0.01103351, + "balance_loss_clip": 1.0015645, + "balance_loss_mlp": 1.00054455, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 1.6886134786273885, + "language_loss": 0.71919304, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74091494, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 2.735053539276123 + }, + { + "auxiliary_loss_clip": 0.01120886, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00176656, + "balance_loss_mlp": 1.00047493, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.5319921455564647, + "language_loss": 0.7600134, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78224933, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 2.6267354488372803 + }, + { + "auxiliary_loss_clip": 0.01164969, + "auxiliary_loss_mlp": 0.01103969, + "balance_loss_clip": 1.00186694, + "balance_loss_mlp": 1.00059032, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 1.8743805289988622, + "language_loss": 0.67953372, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.70222306, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.4790518283843994 + }, + { + "auxiliary_loss_clip": 0.01101851, + "auxiliary_loss_mlp": 0.01104368, + "balance_loss_clip": 1.0016408, + "balance_loss_mlp": 1.00060821, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.7659445235296478, + "language_loss": 0.74479938, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76686156, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 2.665865421295166 + }, + { + "auxiliary_loss_clip": 0.01101835, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_clip": 1.00173783, + "balance_loss_mlp": 1.00044668, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.5028828687361095, + "language_loss": 0.73503423, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75708032, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.6513309478759766 + }, + { + "auxiliary_loss_clip": 0.01135684, + "auxiliary_loss_mlp": 0.01103581, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00048923, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.4050100379029495, + "language_loss": 0.74364978, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76604247, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 2.6038637161254883 + }, + { + "auxiliary_loss_clip": 0.01148525, + "auxiliary_loss_mlp": 0.01102777, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00054336, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.6962889517476842, + "language_loss": 0.79303169, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81554472, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 2.542181968688965 + }, + { + "auxiliary_loss_clip": 0.0116516, + "auxiliary_loss_mlp": 0.01102762, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00052786, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.651063904249305, + "language_loss": 0.73050076, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75318003, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 2.586526870727539 + }, + { + "auxiliary_loss_clip": 0.0114874, + "auxiliary_loss_mlp": 0.00747283, + "balance_loss_clip": 1.00182247, + "balance_loss_mlp": 1.00032711, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 2.27145699874292, + "language_loss": 0.77583182, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79479206, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 2.53658127784729 + }, + { + "auxiliary_loss_clip": 0.01148497, + "auxiliary_loss_mlp": 0.01103372, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00047028, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.5669749656061855, + "language_loss": 0.63169134, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65420997, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 2.5323188304901123 + }, + { + "auxiliary_loss_clip": 0.01148533, + "auxiliary_loss_mlp": 0.01104005, + "balance_loss_clip": 1.0020417, + "balance_loss_mlp": 1.00043631, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 1.672991150580156, + "language_loss": 0.72959352, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75211895, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.551523447036743 + }, + { + "auxiliary_loss_clip": 0.01135059, + "auxiliary_loss_mlp": 0.0110481, + "balance_loss_clip": 1.00194383, + "balance_loss_mlp": 1.00066853, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.747327324450273, + "language_loss": 0.66718626, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68958491, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 2.6372387409210205 + }, + { + "auxiliary_loss_clip": 0.01165206, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.0003823, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 1.8915113596741346, + "language_loss": 0.74088347, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76357126, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 2.4809465408325195 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01102118, + "balance_loss_clip": 1.00166345, + "balance_loss_mlp": 1.00055194, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5155245891015248, + "language_loss": 0.80787981, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83006513, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 2.644150495529175 + }, + { + "auxiliary_loss_clip": 0.01118804, + "auxiliary_loss_mlp": 0.01103402, + "balance_loss_clip": 1.00180209, + "balance_loss_mlp": 1.00069118, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.7923568357889854, + "language_loss": 0.7347585, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.7569806, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 4.007370948791504 + }, + { + "auxiliary_loss_clip": 0.01131754, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_clip": 1.00165892, + "balance_loss_mlp": 1.00055051, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.6509203874999465, + "language_loss": 0.78177238, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80411685, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.6029903888702393 + }, + { + "auxiliary_loss_clip": 0.01084746, + "auxiliary_loss_mlp": 0.00747479, + "balance_loss_clip": 1.00167537, + "balance_loss_mlp": 1.00040591, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.3245548640437654, + "language_loss": 0.70682919, + "learning_rate": 2.93647144674658e-07, + "loss": 0.72515142, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 2.703125 + }, + { + "auxiliary_loss_clip": 0.01165349, + "auxiliary_loss_mlp": 0.01105136, + "balance_loss_clip": 1.00190461, + "balance_loss_mlp": 1.00061297, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 1.9072316572376216, + "language_loss": 0.68023884, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70294374, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.4742703437805176 + }, + { + "auxiliary_loss_clip": 0.01148483, + "auxiliary_loss_mlp": 0.01103228, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00042212, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 3.0466991901621565, + "language_loss": 0.76370233, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.78621948, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 2.5470802783966064 + }, + { + "auxiliary_loss_clip": 0.01116985, + "auxiliary_loss_mlp": 0.01102139, + "balance_loss_clip": 1.00160289, + "balance_loss_mlp": 1.00047743, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 2.3476284376156125, + "language_loss": 0.81473339, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83692461, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.632230758666992 + }, + { + "auxiliary_loss_clip": 0.01148469, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00054324, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.4748251511240342, + "language_loss": 0.78218424, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80470335, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.543738603591919 + }, + { + "auxiliary_loss_clip": 0.01134859, + "auxiliary_loss_mlp": 0.01102998, + "balance_loss_clip": 1.00176406, + "balance_loss_mlp": 1.00047827, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 2.5058792154402383, + "language_loss": 0.8203938, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84277231, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.5726284980773926 + }, + { + "auxiliary_loss_clip": 0.01143829, + "auxiliary_loss_mlp": 0.01080078, + "balance_loss_clip": 1.00116253, + "balance_loss_mlp": 0.99996978, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7592152397031037, + "language_loss": 0.56224966, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58448875, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.140216588973999 + }, + { + "auxiliary_loss_clip": 0.01150323, + "auxiliary_loss_mlp": 0.0110193, + "balance_loss_clip": 1.00182033, + "balance_loss_mlp": 1.00045872, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.8178832359231383, + "language_loss": 0.68748081, + "learning_rate": 2.922266666860831e-07, + "loss": 0.71000338, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 2.4994359016418457 + }, + { + "auxiliary_loss_clip": 0.01072447, + "auxiliary_loss_mlp": 0.01103164, + "balance_loss_clip": 1.00159001, + "balance_loss_mlp": 1.0005486, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 2.817960879763327, + "language_loss": 0.69013822, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71189439, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 2.763744831085205 + }, + { + "auxiliary_loss_clip": 0.0109797, + "auxiliary_loss_mlp": 0.011025, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00064754, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.7918654659410058, + "language_loss": 0.61922967, + "learning_rate": 2.918213985472631e-07, + "loss": 0.6412344, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.7489845752716064 + }, + { + "auxiliary_loss_clip": 0.01129487, + "auxiliary_loss_mlp": 0.01080061, + "balance_loss_clip": 1.00117779, + "balance_loss_mlp": 0.99995279, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.856666130824428, + "language_loss": 0.61937416, + "learning_rate": 2.916188616354669e-07, + "loss": 0.64146966, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 6.13847541809082 + }, + { + "auxiliary_loss_clip": 0.01165154, + "auxiliary_loss_mlp": 0.01103617, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00052428, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.6749531033613767, + "language_loss": 0.7416048, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76429248, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.5189313888549805 + }, + { + "auxiliary_loss_clip": 0.0109743, + "auxiliary_loss_mlp": 0.00747355, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00031316, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 1.884097710554104, + "language_loss": 0.80051231, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.81896007, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 2.6506741046905518 + }, + { + "auxiliary_loss_clip": 0.01165113, + "auxiliary_loss_mlp": 0.01104409, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 1.00055337, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.6131056665499854, + "language_loss": 0.68005621, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70275146, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 3.987497568130493 + }, + { + "auxiliary_loss_clip": 0.01149785, + "auxiliary_loss_mlp": 0.01103162, + "balance_loss_clip": 1.00173295, + "balance_loss_mlp": 1.00064158, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.7565176812195085, + "language_loss": 0.7365467, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.75907624, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 2.5721752643585205 + }, + { + "auxiliary_loss_clip": 0.01120747, + "auxiliary_loss_mlp": 0.01103587, + "balance_loss_clip": 1.00173473, + "balance_loss_mlp": 1.00059021, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 1.54094507991883, + "language_loss": 0.67318362, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69542694, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 2.8828561305999756 + }, + { + "auxiliary_loss_clip": 0.01132089, + "auxiliary_loss_mlp": 0.01104722, + "balance_loss_clip": 1.00170326, + "balance_loss_mlp": 1.00039017, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.5000556619930143, + "language_loss": 0.82482708, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84719515, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 2.583540201187134 + }, + { + "auxiliary_loss_clip": 0.01148436, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00057924, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.729781760906237, + "language_loss": 0.74413037, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76665139, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 2.5206806659698486 + }, + { + "auxiliary_loss_clip": 0.01165107, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.00189555, + "balance_loss_mlp": 1.00069571, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.7234195165770123, + "language_loss": 0.71448582, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73718143, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 2.492595672607422 + }, + { + "auxiliary_loss_clip": 0.01134107, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_clip": 1.00169766, + "balance_loss_mlp": 1.0004245, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.5938079973561303, + "language_loss": 0.84583503, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86821318, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 2.5819785594940186 + }, + { + "auxiliary_loss_clip": 0.01132882, + "auxiliary_loss_mlp": 0.01104213, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00064397, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.4768417533464482, + "language_loss": 0.76051289, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78288376, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.626988410949707 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00040388, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 13.40937213213536, + "language_loss": 0.79452431, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81364703, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 2.4800686836242676 + }, + { + "auxiliary_loss_clip": 0.0114854, + "auxiliary_loss_mlp": 0.01104587, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00063705, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.8425503610624285, + "language_loss": 0.80561936, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82815063, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 2.5198452472686768 + }, + { + "auxiliary_loss_clip": 0.01131931, + "auxiliary_loss_mlp": 0.01102267, + "balance_loss_clip": 1.00170422, + "balance_loss_mlp": 1.00041413, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 2.7295414038209267, + "language_loss": 0.77605772, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79839969, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 2.535318374633789 + }, + { + "auxiliary_loss_clip": 0.01165314, + "auxiliary_loss_mlp": 0.01104145, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00048113, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 2.2402600014824943, + "language_loss": 0.83328605, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85598075, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 2.4881751537323 + }, + { + "auxiliary_loss_clip": 0.01135677, + "auxiliary_loss_mlp": 0.01103913, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00053501, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 1.7146075298145522, + "language_loss": 0.73960114, + "learning_rate": 2.885885860916795e-07, + "loss": 0.7619971, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 2.579163074493408 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01103736, + "balance_loss_clip": 1.00189793, + "balance_loss_mlp": 1.00045252, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.4319173024540175, + "language_loss": 0.68038917, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70293051, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.618414878845215 + }, + { + "auxiliary_loss_clip": 0.01098168, + "auxiliary_loss_mlp": 0.0110383, + "balance_loss_clip": 1.00162685, + "balance_loss_mlp": 1.00054705, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 2.160825494067626, + "language_loss": 0.79569125, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81771123, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 2.6126976013183594 + }, + { + "auxiliary_loss_clip": 0.01113162, + "auxiliary_loss_mlp": 0.01104174, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00050998, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.9504104537985507, + "language_loss": 0.68502426, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70719761, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.574585437774658 + }, + { + "auxiliary_loss_clip": 0.0113372, + "auxiliary_loss_mlp": 0.01103741, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.000458, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 1.8192675065493027, + "language_loss": 0.73087907, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.7532537, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.5971245765686035 + }, + { + "auxiliary_loss_clip": 0.01117521, + "auxiliary_loss_mlp": 0.01103149, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00043797, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.833178094302465, + "language_loss": 0.77516234, + "learning_rate": 2.875817378128975e-07, + "loss": 0.797369, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 2.5870580673217773 + }, + { + "auxiliary_loss_clip": 0.01126803, + "auxiliary_loss_mlp": 0.0107973, + "balance_loss_clip": 1.00111151, + "balance_loss_mlp": 1.0000031, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.8770744215242192, + "language_loss": 0.55264151, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57470685, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 3.097888946533203 + }, + { + "auxiliary_loss_clip": 0.01150181, + "auxiliary_loss_mlp": 0.01104765, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00071907, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 1.5690411262653639, + "language_loss": 0.75694603, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77949548, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.5918948650360107 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01104048, + "balance_loss_clip": 1.00172186, + "balance_loss_mlp": 1.00038385, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.5931470014752929, + "language_loss": 0.78927362, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.81133509, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.6901140213012695 + }, + { + "auxiliary_loss_clip": 0.01084646, + "auxiliary_loss_mlp": 0.01102543, + "balance_loss_clip": 1.00140202, + "balance_loss_mlp": 1.00059545, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.9473313605805485, + "language_loss": 0.74799746, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76986933, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.7156426906585693 + }, + { + "auxiliary_loss_clip": 0.01148712, + "auxiliary_loss_mlp": 0.01103287, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00038576, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 1.704118759690693, + "language_loss": 0.63475978, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65727973, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 4.026581764221191 + }, + { + "auxiliary_loss_clip": 0.01135107, + "auxiliary_loss_mlp": 0.01103948, + "balance_loss_clip": 1.00167012, + "balance_loss_mlp": 1.00047421, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 1.9872016293912922, + "language_loss": 0.79325008, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81564063, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 2.5798327922821045 + }, + { + "auxiliary_loss_clip": 0.01116906, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_clip": 1.00167274, + "balance_loss_mlp": 1.00050914, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.5063748945004793, + "language_loss": 0.78158516, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80377972, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 2.6307995319366455 + }, + { + "auxiliary_loss_clip": 0.01145247, + "auxiliary_loss_mlp": 0.0108005, + "balance_loss_clip": 1.00124907, + "balance_loss_mlp": 0.99994147, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7761702360676996, + "language_loss": 0.55853164, + "learning_rate": 2.859741575868344e-07, + "loss": 0.58078462, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 3.116492509841919 + }, + { + "auxiliary_loss_clip": 0.0115023, + "auxiliary_loss_mlp": 0.01103096, + "balance_loss_clip": 1.0017904, + "balance_loss_mlp": 1.00038576, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 1.7150261032824485, + "language_loss": 0.67322695, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69576019, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 2.620316982269287 + }, + { + "auxiliary_loss_clip": 0.01134143, + "auxiliary_loss_mlp": 0.01102835, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00060153, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.6166846679983828, + "language_loss": 0.78083611, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80320591, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 2.5956614017486572 + }, + { + "auxiliary_loss_clip": 0.01160501, + "auxiliary_loss_mlp": 0.01079687, + "balance_loss_clip": 1.0012176, + "balance_loss_mlp": 0.99995977, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7584970624862166, + "language_loss": 0.58652389, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60892582, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 2.948993682861328 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01104188, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00052333, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.6848043395238612, + "language_loss": 0.71619833, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73872602, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.551320791244507 + }, + { + "auxiliary_loss_clip": 0.01150444, + "auxiliary_loss_mlp": 0.01103045, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00052488, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.8120780903508165, + "language_loss": 0.75311238, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77564728, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.5847318172454834 + }, + { + "auxiliary_loss_clip": 0.01100475, + "auxiliary_loss_mlp": 0.01102163, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00050163, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.457742307021002, + "language_loss": 0.73245722, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75448358, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.650953531265259 + }, + { + "auxiliary_loss_clip": 0.0116525, + "auxiliary_loss_mlp": 0.01104811, + "balance_loss_clip": 1.00188303, + "balance_loss_mlp": 1.00057483, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 1.8407100308206945, + "language_loss": 0.73173004, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75443065, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.4846575260162354 + }, + { + "auxiliary_loss_clip": 0.01148244, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00047231, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.5973556589707645, + "language_loss": 0.78883475, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81133848, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 2.5795607566833496 + }, + { + "auxiliary_loss_clip": 0.01065933, + "auxiliary_loss_mlp": 0.0110224, + "balance_loss_clip": 1.001284, + "balance_loss_mlp": 1.00048316, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.5278725060348615, + "language_loss": 0.82182956, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84351128, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 2.810035228729248 + }, + { + "auxiliary_loss_clip": 0.01165093, + "auxiliary_loss_mlp": 0.01103827, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00054395, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 2.256830428506759, + "language_loss": 0.78980792, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81249714, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 5.405772924423218 + }, + { + "auxiliary_loss_clip": 0.01150375, + "auxiliary_loss_mlp": 0.01103607, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00041986, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 2.6077332650032354, + "language_loss": 0.75614822, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77868807, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.5376486778259277 + }, + { + "auxiliary_loss_clip": 0.0109968, + "auxiliary_loss_mlp": 0.00747332, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00039911, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.485832233384299, + "language_loss": 0.74908757, + "learning_rate": 2.835705879864232e-07, + "loss": 0.76755768, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 2.7293930053710938 + }, + { + "auxiliary_loss_clip": 0.01131252, + "auxiliary_loss_mlp": 0.0110416, + "balance_loss_clip": 1.00166583, + "balance_loss_mlp": 1.00059068, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 2.5520922679077485, + "language_loss": 0.69050753, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71286166, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 4.021954536437988 + }, + { + "auxiliary_loss_clip": 0.01148731, + "auxiliary_loss_mlp": 0.0110394, + "balance_loss_clip": 1.0017575, + "balance_loss_mlp": 1.00065684, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 2.1929603209227264, + "language_loss": 0.75611603, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77864277, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 2.701622247695923 + }, + { + "auxiliary_loss_clip": 0.01128863, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.00126171, + "balance_loss_mlp": 0.9999752, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8795679464246061, + "language_loss": 0.63136232, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.6534518, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 3.077099561691284 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01103595, + "balance_loss_clip": 1.00178039, + "balance_loss_mlp": 1.00059772, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.8409096289720936, + "language_loss": 0.7167086, + "learning_rate": 2.827714802616301e-07, + "loss": 0.73906124, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 2.5853421688079834 + }, + { + "auxiliary_loss_clip": 0.01131826, + "auxiliary_loss_mlp": 0.0110293, + "balance_loss_clip": 1.00168335, + "balance_loss_mlp": 1.00050592, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.4256931147881273, + "language_loss": 0.80490792, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82725549, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 2.6074836254119873 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01103432, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00062585, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.4741946110255564, + "language_loss": 0.82637262, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84890151, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 2.5335614681243896 + }, + { + "auxiliary_loss_clip": 0.01129622, + "auxiliary_loss_mlp": 0.01104079, + "balance_loss_clip": 1.00182676, + "balance_loss_mlp": 1.00041449, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.662860040331634, + "language_loss": 0.70229113, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72462815, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.5229523181915283 + }, + { + "auxiliary_loss_clip": 0.01148347, + "auxiliary_loss_mlp": 0.01102492, + "balance_loss_clip": 1.00191009, + "balance_loss_mlp": 1.00054383, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.7277504587190828, + "language_loss": 0.68747658, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.70998502, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 2.541869640350342 + }, + { + "auxiliary_loss_clip": 0.01131743, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00045991, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.9054964565739745, + "language_loss": 0.73385167, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75620079, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.564756155014038 + }, + { + "auxiliary_loss_clip": 0.01131645, + "auxiliary_loss_mlp": 0.01104474, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00052357, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 3.7944305503377747, + "language_loss": 0.75632501, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77868623, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 2.5949456691741943 + }, + { + "auxiliary_loss_clip": 0.01133875, + "auxiliary_loss_mlp": 0.01103002, + "balance_loss_clip": 1.00186908, + "balance_loss_mlp": 1.00038648, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.6985657160497667, + "language_loss": 0.66149414, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68386292, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.6300342082977295 + }, + { + "auxiliary_loss_clip": 0.0110418, + "auxiliary_loss_mlp": 0.01102671, + "balance_loss_clip": 1.0017513, + "balance_loss_mlp": 1.00053215, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 4.492370149289248, + "language_loss": 0.79702139, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81908983, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 2.6573734283447266 + }, + { + "auxiliary_loss_clip": 0.01150122, + "auxiliary_loss_mlp": 0.01102048, + "balance_loss_clip": 1.00192201, + "balance_loss_mlp": 1.00057685, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 2.0812374979836643, + "language_loss": 0.87319154, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89571315, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.544663190841675 + }, + { + "auxiliary_loss_clip": 0.01116931, + "auxiliary_loss_mlp": 0.01101905, + "balance_loss_clip": 1.00188315, + "balance_loss_mlp": 1.0006249, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.7638384165677077, + "language_loss": 0.69338989, + "learning_rate": 2.807782702318828e-07, + "loss": 0.7155782, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 2.5681231021881104 + }, + { + "auxiliary_loss_clip": 0.01133531, + "auxiliary_loss_mlp": 0.01103315, + "balance_loss_clip": 1.001773, + "balance_loss_mlp": 1.00050926, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 2.1307337399366606, + "language_loss": 0.79531848, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81768692, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.556769371032715 + }, + { + "auxiliary_loss_clip": 0.01098924, + "auxiliary_loss_mlp": 0.01102506, + "balance_loss_clip": 1.00167441, + "balance_loss_mlp": 1.00046325, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 2.205658703538667, + "language_loss": 0.83778322, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85979748, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 2.6228420734405518 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01103679, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00039649, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.4749463921251555, + "language_loss": 0.78154063, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80391234, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.621710777282715 + }, + { + "auxiliary_loss_clip": 0.01117259, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00055051, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 5.376301066575215, + "language_loss": 0.77945119, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80165064, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.583068609237671 + }, + { + "auxiliary_loss_clip": 0.01118796, + "auxiliary_loss_mlp": 0.01102905, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00076652, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.836235584379601, + "language_loss": 0.80462813, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82684517, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.6449332237243652 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01103565, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.00047231, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 1.9360835034452428, + "language_loss": 0.74438292, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76690096, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.6019322872161865 + }, + { + "auxiliary_loss_clip": 0.01132177, + "auxiliary_loss_mlp": 0.0110465, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.0005089, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 2.062539120754458, + "language_loss": 0.70270252, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72507077, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.6432268619537354 + }, + { + "auxiliary_loss_clip": 0.01115393, + "auxiliary_loss_mlp": 0.01102694, + "balance_loss_clip": 1.00162411, + "balance_loss_mlp": 1.00055504, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.6451244072546685, + "language_loss": 0.69490272, + "learning_rate": 2.791883957449912e-07, + "loss": 0.71708357, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 4.281404256820679 + }, + { + "auxiliary_loss_clip": 0.01116966, + "auxiliary_loss_mlp": 0.01101905, + "balance_loss_clip": 1.00169611, + "balance_loss_mlp": 1.00033867, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 2.111069219151427, + "language_loss": 0.79065728, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81284606, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 2.6443932056427 + }, + { + "auxiliary_loss_clip": 0.0113207, + "auxiliary_loss_mlp": 0.00747354, + "balance_loss_clip": 1.00174463, + "balance_loss_mlp": 1.00034046, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 1.9380452484673605, + "language_loss": 0.63693362, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.65572792, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.5799427032470703 + }, + { + "auxiliary_loss_clip": 0.0113207, + "auxiliary_loss_mlp": 0.01103449, + "balance_loss_clip": 1.00175107, + "balance_loss_mlp": 1.00035715, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 3.3363066297845663, + "language_loss": 0.67328739, + "learning_rate": 2.785932692855244e-07, + "loss": 0.69564259, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 2.5322399139404297 + }, + { + "auxiliary_loss_clip": 0.01150516, + "auxiliary_loss_mlp": 0.01102523, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00038457, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 1.6909655997110424, + "language_loss": 0.68549395, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70802426, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.5371313095092773 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00067031, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.8146026698764355, + "language_loss": 0.58995354, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61231399, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 2.5407865047454834 + }, + { + "auxiliary_loss_clip": 0.01148168, + "auxiliary_loss_mlp": 0.01102849, + "balance_loss_clip": 1.00173485, + "balance_loss_mlp": 1.00042415, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.5904945254344403, + "language_loss": 0.7171787, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73968887, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 2.56502103805542 + }, + { + "auxiliary_loss_clip": 0.01165033, + "auxiliary_loss_mlp": 0.01102258, + "balance_loss_clip": 1.00188291, + "balance_loss_mlp": 1.00040519, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.5586983085787836, + "language_loss": 0.66019231, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68286526, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 2.5282421112060547 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01102782, + "balance_loss_clip": 1.00176966, + "balance_loss_mlp": 1.00045252, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 1.8761927259043607, + "language_loss": 0.78473067, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80705184, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.5555665493011475 + }, + { + "auxiliary_loss_clip": 0.01150538, + "auxiliary_loss_mlp": 0.01102499, + "balance_loss_clip": 1.00203383, + "balance_loss_mlp": 1.00064707, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.6406724385160605, + "language_loss": 0.72302794, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74555838, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.538599729537964 + }, + { + "auxiliary_loss_clip": 0.01148382, + "auxiliary_loss_mlp": 0.01104265, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 1.00069642, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 8.875244034692765, + "language_loss": 0.71540904, + "learning_rate": 2.772069258877667e-07, + "loss": 0.73793548, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.5452003479003906 + }, + { + "auxiliary_loss_clip": 0.01150151, + "auxiliary_loss_mlp": 0.01102683, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00044966, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.1419022994684074, + "language_loss": 0.58491164, + "learning_rate": 2.770091380848423e-07, + "loss": 0.60743999, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.7994613647460938 + }, + { + "auxiliary_loss_clip": 0.01160479, + "auxiliary_loss_mlp": 0.00746513, + "balance_loss_clip": 1.00120831, + "balance_loss_mlp": 1.00130391, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6960063479438959, + "language_loss": 0.57646298, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59553289, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.1497297286987305 + }, + { + "auxiliary_loss_clip": 0.01145978, + "auxiliary_loss_mlp": 0.01104268, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.8310120745862615, + "language_loss": 0.79961324, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82211566, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 2.530292272567749 + }, + { + "auxiliary_loss_clip": 0.01165194, + "auxiliary_loss_mlp": 0.01103193, + "balance_loss_clip": 1.00197291, + "balance_loss_mlp": 1.00057817, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.6363000612133003, + "language_loss": 0.68991011, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71259403, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 4.10714316368103 + }, + { + "auxiliary_loss_clip": 0.01133552, + "auxiliary_loss_mlp": 0.01102204, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00044751, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.4563284580609912, + "language_loss": 0.70986843, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73222601, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.614248514175415 + }, + { + "auxiliary_loss_clip": 0.01101954, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_clip": 1.00160217, + "balance_loss_mlp": 1.0005101, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.082029288546491, + "language_loss": 0.79725581, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.81931424, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 4.0483503341674805 + }, + { + "auxiliary_loss_clip": 0.01149829, + "auxiliary_loss_mlp": 0.0110243, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00048232, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.625001157396264, + "language_loss": 0.62649345, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64901602, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 2.5503458976745605 + }, + { + "auxiliary_loss_clip": 0.01135441, + "auxiliary_loss_mlp": 0.01104492, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.9781578885240985, + "language_loss": 0.74363041, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76602972, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 2.601067304611206 + }, + { + "auxiliary_loss_clip": 0.01133856, + "auxiliary_loss_mlp": 0.01102486, + "balance_loss_clip": 1.00168669, + "balance_loss_mlp": 1.00053811, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.6309592669509536, + "language_loss": 0.72752196, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74988544, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 2.5503549575805664 + }, + { + "auxiliary_loss_clip": 0.01150405, + "auxiliary_loss_mlp": 0.01101997, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00062144, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.668428268555512, + "language_loss": 0.66655582, + "learning_rate": 2.752319888771e-07, + "loss": 0.68907982, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 2.6236753463745117 + }, + { + "auxiliary_loss_clip": 0.01148294, + "auxiliary_loss_mlp": 0.01103077, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00055718, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.397612162295584, + "language_loss": 0.74055696, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.7630707, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.5457301139831543 + }, + { + "auxiliary_loss_clip": 0.01114481, + "auxiliary_loss_mlp": 0.01103382, + "balance_loss_clip": 1.00162876, + "balance_loss_mlp": 1.00057554, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 1.8286886322223883, + "language_loss": 0.75249547, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77467412, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 2.653749942779541 + }, + { + "auxiliary_loss_clip": 0.01147859, + "auxiliary_loss_mlp": 0.0110432, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00046468, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 1.9837716042812845, + "language_loss": 0.71921539, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.74173713, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 2.5741827487945557 + }, + { + "auxiliary_loss_clip": 0.01165213, + "auxiliary_loss_mlp": 0.00747424, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.0004108, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 1.9969690634344661, + "language_loss": 0.73544812, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75457448, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 2.4845669269561768 + }, + { + "auxiliary_loss_clip": 0.01150246, + "auxiliary_loss_mlp": 0.00747456, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00035596, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.8659680970483754, + "language_loss": 0.73467046, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75364751, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 2.526571273803711 + }, + { + "auxiliary_loss_clip": 0.01133556, + "auxiliary_loss_mlp": 0.01103312, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00060105, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 1.7878558999756269, + "language_loss": 0.7914598, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81382847, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.5562095642089844 + }, + { + "auxiliary_loss_clip": 0.01150281, + "auxiliary_loss_mlp": 0.01102633, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00049484, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.6285318065673084, + "language_loss": 0.78957438, + "learning_rate": 2.738534240246797e-07, + "loss": 0.81210351, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 2.5247325897216797 + }, + { + "auxiliary_loss_clip": 0.01148411, + "auxiliary_loss_mlp": 0.01104208, + "balance_loss_clip": 1.00174153, + "balance_loss_mlp": 1.00054312, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 2.123751514701181, + "language_loss": 0.73132372, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75384992, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.543038845062256 + }, + { + "auxiliary_loss_clip": 0.01100834, + "auxiliary_loss_mlp": 0.01102604, + "balance_loss_clip": 1.00171506, + "balance_loss_mlp": 1.00046539, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.5377368438707244, + "language_loss": 0.71249723, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73453164, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 2.6525726318359375 + }, + { + "auxiliary_loss_clip": 0.01116825, + "auxiliary_loss_mlp": 0.01103339, + "balance_loss_clip": 1.00163317, + "balance_loss_mlp": 1.00043762, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.8088848881418698, + "language_loss": 0.72623265, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74843431, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 2.597362518310547 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.00747431, + "balance_loss_clip": 1.00175548, + "balance_loss_mlp": 1.00031781, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 2.7430812616751408, + "language_loss": 0.74963695, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76828015, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.5976667404174805 + }, + { + "auxiliary_loss_clip": 0.01165021, + "auxiliary_loss_mlp": 0.01102033, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00056219, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.462604440034361, + "language_loss": 0.79034066, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81301111, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.5242788791656494 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.00160575, + "balance_loss_mlp": 1.00062513, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.533251889834061, + "language_loss": 0.67621291, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.69823837, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 2.6932995319366455 + }, + { + "auxiliary_loss_clip": 0.01150033, + "auxiliary_loss_mlp": 0.01102517, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00037837, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.7646166395772902, + "language_loss": 0.73910993, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.76163542, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.5625367164611816 + }, + { + "auxiliary_loss_clip": 0.01135536, + "auxiliary_loss_mlp": 0.01102452, + "balance_loss_clip": 1.00174069, + "balance_loss_mlp": 1.00059974, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.6724042532635026, + "language_loss": 0.68758726, + "learning_rate": 2.722818488237566e-07, + "loss": 0.70996714, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.5907509326934814 + }, + { + "auxiliary_loss_clip": 0.01150037, + "auxiliary_loss_mlp": 0.01104005, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00053167, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 2.132172816663014, + "language_loss": 0.85002482, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87256527, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.5418343544006348 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.00747319, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00032163, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.7005005739358232, + "language_loss": 0.71542776, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73405308, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.7584400177001953 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.01102565, + "balance_loss_clip": 1.00168228, + "balance_loss_mlp": 1.00052214, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 1.766020533714354, + "language_loss": 0.76118159, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78353643, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 4.072512626647949 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01101986, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00051522, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.4874332916261939, + "language_loss": 0.64346623, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66581237, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.6565473079681396 + }, + { + "auxiliary_loss_clip": 0.011331, + "auxiliary_loss_mlp": 0.01103566, + "balance_loss_clip": 1.00172424, + "balance_loss_mlp": 1.00047421, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 1.7123211021066835, + "language_loss": 0.74657136, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76893806, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.6226389408111572 + }, + { + "auxiliary_loss_clip": 0.01148547, + "auxiliary_loss_mlp": 0.01103371, + "balance_loss_clip": 1.00196183, + "balance_loss_mlp": 1.00056481, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.7085924145828324, + "language_loss": 0.7113989, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73391807, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.524060010910034 + }, + { + "auxiliary_loss_clip": 0.0112395, + "auxiliary_loss_mlp": 0.01080113, + "balance_loss_clip": 1.00119662, + "balance_loss_mlp": 1.00000441, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.708285577675594, + "language_loss": 0.58791637, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60995698, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 3.281595230102539 + }, + { + "auxiliary_loss_clip": 0.01116828, + "auxiliary_loss_mlp": 0.01103313, + "balance_loss_clip": 1.00194705, + "balance_loss_mlp": 1.0006026, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.6677594395358888, + "language_loss": 0.70008433, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72228575, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 2.618964910507202 + }, + { + "auxiliary_loss_clip": 0.01148447, + "auxiliary_loss_mlp": 0.01105162, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00044894, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.8778700365465943, + "language_loss": 0.67021382, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69274986, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.7145161628723145 + }, + { + "auxiliary_loss_clip": 0.01099185, + "auxiliary_loss_mlp": 0.01102871, + "balance_loss_clip": 1.00165832, + "balance_loss_mlp": 1.00044656, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.930056760630033, + "language_loss": 0.71447515, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73649573, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 2.6592392921447754 + }, + { + "auxiliary_loss_clip": 0.01119068, + "auxiliary_loss_mlp": 0.01103085, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00056458, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.4688114305043194, + "language_loss": 0.71920073, + "learning_rate": 2.701277800409705e-07, + "loss": 0.7414223, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.632236957550049 + }, + { + "auxiliary_loss_clip": 0.01067782, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_clip": 1.00150108, + "balance_loss_mlp": 1.00044763, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.110509701034037, + "language_loss": 0.66778791, + "learning_rate": 2.699323490393628e-07, + "loss": 0.6894877, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 2.7522056102752686 + }, + { + "auxiliary_loss_clip": 0.01133765, + "auxiliary_loss_mlp": 0.01102957, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00062752, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.859756694366236, + "language_loss": 0.76546395, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78783113, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.560288190841675 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01103225, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00060987, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.6751177849424244, + "language_loss": 0.7742734, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79679143, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.5848352909088135 + }, + { + "auxiliary_loss_clip": 0.0111524, + "auxiliary_loss_mlp": 0.01103931, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00045741, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 8.829132384614036, + "language_loss": 0.56435847, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.58655012, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.5693893432617188 + }, + { + "auxiliary_loss_clip": 0.01150037, + "auxiliary_loss_mlp": 0.01103338, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00053191, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.8219358791751865, + "language_loss": 0.89803106, + "learning_rate": 2.691512811503882e-07, + "loss": 0.92056477, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.5069313049316406 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01103478, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00048089, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.887785978191804, + "language_loss": 0.81451219, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83704615, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 5.334834814071655 + }, + { + "auxiliary_loss_clip": 0.01150033, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.0005461, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.5637014977639492, + "language_loss": 0.7037943, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72633767, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.5852949619293213 + }, + { + "auxiliary_loss_clip": 0.01116396, + "auxiliary_loss_mlp": 0.0110373, + "balance_loss_clip": 1.00169241, + "balance_loss_mlp": 1.00073314, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.9510715184322895, + "language_loss": 0.76286757, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78506887, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 4.04033899307251 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01103697, + "balance_loss_clip": 1.00195873, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.9795854884331658, + "language_loss": 0.76397109, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78651202, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.6045045852661133 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.01103944, + "balance_loss_clip": 1.00152469, + "balance_loss_mlp": 1.00047064, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.325891761015176, + "language_loss": 0.73445922, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75650102, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 2.683053731918335 + }, + { + "auxiliary_loss_clip": 0.01102403, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.00050974, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.4160569437588657, + "language_loss": 0.79368716, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81575108, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 2.703171968460083 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01102888, + "balance_loss_clip": 1.00177741, + "balance_loss_mlp": 1.0004636, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 2.4755947424200277, + "language_loss": 0.85318065, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87523091, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 2.6463050842285156 + }, + { + "auxiliary_loss_clip": 0.01139859, + "auxiliary_loss_mlp": 0.00746577, + "balance_loss_clip": 1.00117254, + "balance_loss_mlp": 1.00128698, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6200253839628205, + "language_loss": 0.50261128, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52147567, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.2440154552459717 + }, + { + "auxiliary_loss_clip": 0.01104022, + "auxiliary_loss_mlp": 0.01102696, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00055766, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 2.0616686185047923, + "language_loss": 0.65232491, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67439204, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 2.7173972129821777 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01102891, + "balance_loss_clip": 1.00164092, + "balance_loss_mlp": 1.00056136, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.5340183326869412, + "language_loss": 0.67625177, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69828427, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 2.736241102218628 + }, + { + "auxiliary_loss_clip": 0.01133697, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_clip": 1.00172698, + "balance_loss_mlp": 1.00045705, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.47136098714526, + "language_loss": 0.69662994, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.71899861, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 2.659147262573242 + }, + { + "auxiliary_loss_clip": 0.01131534, + "auxiliary_loss_mlp": 0.01101346, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00054252, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 1.8432803380250284, + "language_loss": 0.8485176, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87084639, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 2.6128668785095215 + }, + { + "auxiliary_loss_clip": 0.01131801, + "auxiliary_loss_mlp": 0.01102442, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00058913, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 9.242585903582784, + "language_loss": 0.70207793, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72442031, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 2.5899009704589844 + }, + { + "auxiliary_loss_clip": 0.01146035, + "auxiliary_loss_mlp": 0.01102856, + "balance_loss_clip": 1.00197732, + "balance_loss_mlp": 1.00062215, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.5811736986788318, + "language_loss": 0.64344084, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66592973, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.5175564289093018 + }, + { + "auxiliary_loss_clip": 0.01148732, + "auxiliary_loss_mlp": 0.01102654, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00042057, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.460743843412909, + "language_loss": 0.70052326, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72303718, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 2.603158473968506 + }, + { + "auxiliary_loss_clip": 0.01148316, + "auxiliary_loss_mlp": 0.01103224, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00051284, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 2.756724919737539, + "language_loss": 0.72468072, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.74719614, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.5355215072631836 + }, + { + "auxiliary_loss_clip": 0.01082989, + "auxiliary_loss_mlp": 0.0110356, + "balance_loss_clip": 1.00179648, + "balance_loss_mlp": 1.00046778, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 2.139616442960543, + "language_loss": 0.6804359, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70230144, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.697667121887207 + }, + { + "auxiliary_loss_clip": 0.0113109, + "auxiliary_loss_mlp": 0.01102469, + "balance_loss_clip": 1.00177395, + "balance_loss_mlp": 1.00052106, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.6405116621510478, + "language_loss": 0.73435962, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75669527, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.553600788116455 + }, + { + "auxiliary_loss_clip": 0.010858, + "auxiliary_loss_mlp": 0.01103277, + "balance_loss_clip": 1.00154316, + "balance_loss_mlp": 1.00047064, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 3.7802144425149256, + "language_loss": 0.6640836, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68597436, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.6936681270599365 + }, + { + "auxiliary_loss_clip": 0.01148487, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00057054, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.8044997182400133, + "language_loss": 0.79622841, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81875277, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.5724687576293945 + }, + { + "auxiliary_loss_clip": 0.01084719, + "auxiliary_loss_mlp": 0.01081446, + "balance_loss_clip": 1.00188315, + "balance_loss_mlp": 1.00057459, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7635548273734969, + "language_loss": 0.53373641, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55539805, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 3.363556146621704 + }, + { + "auxiliary_loss_clip": 0.01148134, + "auxiliary_loss_mlp": 0.01102813, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.00057888, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.977824526638822, + "language_loss": 0.73089057, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75340003, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 2.5271644592285156 + }, + { + "auxiliary_loss_clip": 0.01116797, + "auxiliary_loss_mlp": 0.01103678, + "balance_loss_clip": 1.00186431, + "balance_loss_mlp": 1.00039542, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.6805103431037478, + "language_loss": 0.5572648, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57946956, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 2.6748595237731934 + }, + { + "auxiliary_loss_clip": 0.01114996, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_clip": 1.00113356, + "balance_loss_mlp": 1.00004339, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.780120682075373, + "language_loss": 0.6070146, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62896222, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 4.743381500244141 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.00147748, + "balance_loss_mlp": 1.00052571, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.745121328115336, + "language_loss": 0.68399847, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70604891, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 2.6223602294921875 + }, + { + "auxiliary_loss_clip": 0.0111683, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_clip": 1.00164402, + "balance_loss_mlp": 1.00032628, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.9375579076200538, + "language_loss": 0.73597085, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75817239, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 2.6142160892486572 + }, + { + "auxiliary_loss_clip": 0.01133789, + "auxiliary_loss_mlp": 0.01102769, + "balance_loss_clip": 1.00180817, + "balance_loss_mlp": 1.00053549, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 2.6931450060416977, + "language_loss": 0.76648629, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78885186, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.6343765258789062 + }, + { + "auxiliary_loss_clip": 0.01131595, + "auxiliary_loss_mlp": 0.0110384, + "balance_loss_clip": 1.00166142, + "balance_loss_mlp": 1.00055742, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 1.877519918713862, + "language_loss": 0.78112161, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80347592, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.569178819656372 + }, + { + "auxiliary_loss_clip": 0.01148557, + "auxiliary_loss_mlp": 0.01102284, + "balance_loss_clip": 1.0016346, + "balance_loss_mlp": 1.00052738, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 1.6641675599778682, + "language_loss": 0.66006804, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68257648, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 2.654802083969116 + }, + { + "auxiliary_loss_clip": 0.01131959, + "auxiliary_loss_mlp": 0.01102762, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.00052869, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.518823865412905, + "language_loss": 0.74430919, + "learning_rate": 2.633267779230177e-07, + "loss": 0.7666564, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 2.6141481399536133 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01103214, + "balance_loss_clip": 1.00161827, + "balance_loss_mlp": 1.0004077, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 1.806120486755241, + "language_loss": 0.83047414, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85282457, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 2.549039840698242 + }, + { + "auxiliary_loss_clip": 0.01129461, + "auxiliary_loss_mlp": 0.01102917, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.0005877, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 1.796017024793088, + "language_loss": 0.77397561, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79629934, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.5472497940063477 + }, + { + "auxiliary_loss_clip": 0.01133489, + "auxiliary_loss_mlp": 0.01104838, + "balance_loss_clip": 1.00163627, + "balance_loss_mlp": 1.00031567, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 1.9407059039865486, + "language_loss": 0.77070326, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79308653, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.600641965866089 + }, + { + "auxiliary_loss_clip": 0.01131657, + "auxiliary_loss_mlp": 0.01103826, + "balance_loss_clip": 1.00172174, + "balance_loss_mlp": 1.00054276, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 3.1354819559211022, + "language_loss": 0.72077942, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74313426, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 2.5888845920562744 + }, + { + "auxiliary_loss_clip": 0.01133651, + "auxiliary_loss_mlp": 0.01103135, + "balance_loss_clip": 1.00184572, + "balance_loss_mlp": 1.00051999, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 3.070816642980719, + "language_loss": 0.77194577, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79431361, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.545156955718994 + }, + { + "auxiliary_loss_clip": 0.01084812, + "auxiliary_loss_mlp": 0.01102489, + "balance_loss_clip": 1.00169396, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.447912104387998, + "language_loss": 0.68392229, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70579529, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 2.707808017730713 + }, + { + "auxiliary_loss_clip": 0.01131499, + "auxiliary_loss_mlp": 0.01102801, + "balance_loss_clip": 1.00162721, + "balance_loss_mlp": 1.00047135, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 2.419319616682659, + "language_loss": 0.78229904, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80464196, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.55297589302063 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01102946, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.0005213, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.435386995348037, + "language_loss": 0.72485876, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74737525, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 4.104823350906372 + }, + { + "auxiliary_loss_clip": 0.01135075, + "auxiliary_loss_mlp": 0.0110332, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.0004189, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 2.114682787288946, + "language_loss": 0.72195435, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74433827, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 3.9868109226226807 + }, + { + "auxiliary_loss_clip": 0.01165025, + "auxiliary_loss_mlp": 0.00747302, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00031567, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.6488693956978415, + "language_loss": 0.72197562, + "learning_rate": 2.61398438016311e-07, + "loss": 0.74109888, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.5174014568328857 + }, + { + "auxiliary_loss_clip": 0.01148304, + "auxiliary_loss_mlp": 0.01102046, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00047934, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.3465661856757876, + "language_loss": 0.68439829, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70690179, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.656057834625244 + }, + { + "auxiliary_loss_clip": 0.01118728, + "auxiliary_loss_mlp": 0.01101429, + "balance_loss_clip": 1.00164962, + "balance_loss_mlp": 1.00062609, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.6444719314565, + "language_loss": 0.77952814, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80172968, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.5842695236206055 + }, + { + "auxiliary_loss_clip": 0.01148722, + "auxiliary_loss_mlp": 0.01103429, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00043178, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 1.793830086732585, + "language_loss": 0.77939355, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80191505, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 2.5152199268341064 + }, + { + "auxiliary_loss_clip": 0.01116996, + "auxiliary_loss_mlp": 0.01103328, + "balance_loss_clip": 1.00187206, + "balance_loss_mlp": 1.00052214, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.5482202957107725, + "language_loss": 0.86481482, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88701808, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 2.6739299297332764 + }, + { + "auxiliary_loss_clip": 0.01150366, + "auxiliary_loss_mlp": 0.01103398, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00059223, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 1.8592801828460341, + "language_loss": 0.67166269, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.69420028, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 2.562751531600952 + }, + { + "auxiliary_loss_clip": 0.01101823, + "auxiliary_loss_mlp": 0.01103093, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.00066853, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.598235129675153, + "language_loss": 0.68144119, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70349032, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.7292401790618896 + }, + { + "auxiliary_loss_clip": 0.01118661, + "auxiliary_loss_mlp": 0.01103547, + "balance_loss_clip": 1.00172055, + "balance_loss_mlp": 1.00064516, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 2.777716599530939, + "language_loss": 0.78730154, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.80952364, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 2.678717851638794 + }, + { + "auxiliary_loss_clip": 0.01148248, + "auxiliary_loss_mlp": 0.01102682, + "balance_loss_clip": 1.00177765, + "balance_loss_mlp": 1.00044823, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 1.867168871013069, + "language_loss": 0.60286975, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62537903, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 2.568699359893799 + }, + { + "auxiliary_loss_clip": 0.01103691, + "auxiliary_loss_mlp": 0.01103429, + "balance_loss_clip": 1.00158525, + "balance_loss_mlp": 1.0005275, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.7163828928913964, + "language_loss": 0.81659162, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83866286, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.6893138885498047 + }, + { + "auxiliary_loss_clip": 0.01131869, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00029945, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 2.017812440319397, + "language_loss": 0.65929401, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67808604, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 2.6484086513519287 + }, + { + "auxiliary_loss_clip": 0.01165016, + "auxiliary_loss_mlp": 0.00747382, + "balance_loss_clip": 1.00185335, + "balance_loss_mlp": 1.00041032, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 3.2840701577558224, + "language_loss": 0.67703879, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69616282, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 2.5517232418060303 + }, + { + "auxiliary_loss_clip": 0.01148818, + "auxiliary_loss_mlp": 0.01104542, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00059175, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.0854772258933614, + "language_loss": 0.81190395, + "learning_rate": 2.590931332560622e-07, + "loss": 0.83443755, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 2.5175669193267822 + }, + { + "auxiliary_loss_clip": 0.01150353, + "auxiliary_loss_mlp": 0.01103545, + "balance_loss_clip": 1.00181603, + "balance_loss_mlp": 1.00045264, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.6906445521601874, + "language_loss": 0.7537756, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77631456, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 2.593385696411133 + }, + { + "auxiliary_loss_clip": 0.01148495, + "auxiliary_loss_mlp": 0.01102704, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.0006609, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.5178501561938322, + "language_loss": 0.8038528, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82636476, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 2.5429487228393555 + }, + { + "auxiliary_loss_clip": 0.01114657, + "auxiliary_loss_mlp": 0.01102688, + "balance_loss_clip": 1.00160933, + "balance_loss_mlp": 1.00054979, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.064067210925555, + "language_loss": 0.70561731, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72779071, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.625443696975708 + }, + { + "auxiliary_loss_clip": 0.01118569, + "auxiliary_loss_mlp": 0.01103046, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.00043011, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 18.67185016412252, + "language_loss": 0.76579058, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78800678, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 2.6219375133514404 + }, + { + "auxiliary_loss_clip": 0.01150452, + "auxiliary_loss_mlp": 0.01104379, + "balance_loss_clip": 1.00185144, + "balance_loss_mlp": 1.00052357, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 1.8563774222578435, + "language_loss": 0.7435509, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76609921, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.5873477458953857 + }, + { + "auxiliary_loss_clip": 0.01150258, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.00046718, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.4766112603741017, + "language_loss": 0.59371364, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.6162442, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.510258913040161 + }, + { + "auxiliary_loss_clip": 0.01147802, + "auxiliary_loss_mlp": 0.01102422, + "balance_loss_clip": 1.0017401, + "balance_loss_mlp": 1.00056994, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.6842586280358405, + "language_loss": 0.72049332, + "learning_rate": 2.577527613603163e-07, + "loss": 0.7429955, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.5594723224639893 + }, + { + "auxiliary_loss_clip": 0.01133716, + "auxiliary_loss_mlp": 0.01102386, + "balance_loss_clip": 1.00180626, + "balance_loss_mlp": 1.00048614, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.6842040356023322, + "language_loss": 0.64364803, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66600907, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 2.5856800079345703 + }, + { + "auxiliary_loss_clip": 0.01132463, + "auxiliary_loss_mlp": 0.01104543, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00049722, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 1.7834191278643383, + "language_loss": 0.82033116, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.8427012, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.5998637676239014 + }, + { + "auxiliary_loss_clip": 0.01148413, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00035, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.4881430275024337, + "language_loss": 0.8012957, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82025445, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 4.020295858383179 + }, + { + "auxiliary_loss_clip": 0.01150559, + "auxiliary_loss_mlp": 0.01103493, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00049615, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 2.356552688958383, + "language_loss": 0.66789442, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69043493, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 2.5916435718536377 + }, + { + "auxiliary_loss_clip": 0.01149941, + "auxiliary_loss_mlp": 0.01104395, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00054014, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.4505058555410726, + "language_loss": 0.79504573, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81758916, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.580256223678589 + }, + { + "auxiliary_loss_clip": 0.01084111, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.00044537, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.5895269453316112, + "language_loss": 0.78799278, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80985779, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.6960325241088867 + }, + { + "auxiliary_loss_clip": 0.01098086, + "auxiliary_loss_mlp": 0.00747271, + "balance_loss_clip": 1.00154769, + "balance_loss_mlp": 1.00030565, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.3723651835044601, + "language_loss": 0.7807911, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.79924464, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.7301628589630127 + }, + { + "auxiliary_loss_clip": 0.01132, + "auxiliary_loss_mlp": 0.01103855, + "balance_loss_clip": 1.00185084, + "balance_loss_mlp": 1.00047636, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.765473199396545, + "language_loss": 0.65312648, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67548507, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 2.5877914428710938 + }, + { + "auxiliary_loss_clip": 0.01150487, + "auxiliary_loss_mlp": 0.01104056, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00048649, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 2.8247076076141395, + "language_loss": 0.75701416, + "learning_rate": 2.560341831785724e-07, + "loss": 0.77955955, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.571829319000244 + }, + { + "auxiliary_loss_clip": 0.0111695, + "auxiliary_loss_mlp": 0.00747347, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00038576, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.5473805761284278, + "language_loss": 0.77637434, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79501724, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 2.591392755508423 + }, + { + "auxiliary_loss_clip": 0.01148359, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00063121, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.7001205983033187, + "language_loss": 0.77254969, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79506385, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.504688024520874 + }, + { + "auxiliary_loss_clip": 0.01129445, + "auxiliary_loss_mlp": 0.01103022, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00040627, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 2.2338183390697517, + "language_loss": 0.66005886, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68238354, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.632824659347534 + }, + { + "auxiliary_loss_clip": 0.01144005, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_clip": 1.00118911, + "balance_loss_mlp": 1.00000572, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7055761128332005, + "language_loss": 0.56947803, + "learning_rate": 2.552720897550631e-07, + "loss": 0.59171158, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.2058095932006836 + }, + { + "auxiliary_loss_clip": 0.01081505, + "auxiliary_loss_mlp": 0.01101941, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00056565, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.2117611290848735, + "language_loss": 0.78028435, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80211878, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 2.7416419982910156 + }, + { + "auxiliary_loss_clip": 0.01165071, + "auxiliary_loss_mlp": 0.01104705, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00056374, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.7640678056933192, + "language_loss": 0.724994, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74769175, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.4943861961364746 + }, + { + "auxiliary_loss_clip": 0.01148584, + "auxiliary_loss_mlp": 0.01103758, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00066566, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.872962077415223, + "language_loss": 0.84231985, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86484325, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.5515644550323486 + }, + { + "auxiliary_loss_clip": 0.01164803, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.0005722, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.4758110205112183, + "language_loss": 0.68210733, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70477581, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 5.381823301315308 + }, + { + "auxiliary_loss_clip": 0.01165265, + "auxiliary_loss_mlp": 0.01103812, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00043404, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 2.191110181365813, + "language_loss": 0.79372549, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81641626, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 3.862631320953369 + }, + { + "auxiliary_loss_clip": 0.01118525, + "auxiliary_loss_mlp": 0.00747217, + "balance_loss_clip": 1.00171065, + "balance_loss_mlp": 1.00044155, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.5782724464676978, + "language_loss": 0.670623, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.68928039, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.6388449668884277 + }, + { + "auxiliary_loss_clip": 0.01165096, + "auxiliary_loss_mlp": 0.01103301, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00039935, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.4193352896687403, + "language_loss": 0.75873387, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78141785, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.5249812602996826 + }, + { + "auxiliary_loss_clip": 0.01133772, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_clip": 1.00184786, + "balance_loss_mlp": 1.00055563, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 1.9053026493931515, + "language_loss": 0.79154253, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81391102, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.5739376544952393 + }, + { + "auxiliary_loss_clip": 0.01131785, + "auxiliary_loss_mlp": 0.01102764, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.00062513, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 1.8740628080230304, + "language_loss": 0.62738895, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64973444, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 2.521470069885254 + }, + { + "auxiliary_loss_clip": 0.01148329, + "auxiliary_loss_mlp": 0.01102568, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00052452, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.7753933160624156, + "language_loss": 0.79313529, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81564426, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 2.546038866043091 + }, + { + "auxiliary_loss_clip": 0.01118914, + "auxiliary_loss_mlp": 0.01103213, + "balance_loss_clip": 1.00181711, + "balance_loss_mlp": 1.00050259, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7221009511753564, + "language_loss": 0.78397924, + "learning_rate": 2.531817924498265e-07, + "loss": 0.8062005, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 2.663264274597168 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01103041, + "balance_loss_clip": 1.00165701, + "balance_loss_mlp": 1.00042534, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.6560792549530003, + "language_loss": 0.71170551, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.7342214, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 2.5145063400268555 + }, + { + "auxiliary_loss_clip": 0.01129716, + "auxiliary_loss_mlp": 0.01104167, + "balance_loss_clip": 1.00209916, + "balance_loss_mlp": 1.00059783, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.5456603414796704, + "language_loss": 0.69854665, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.72088546, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.621074676513672 + }, + { + "auxiliary_loss_clip": 0.01087402, + "auxiliary_loss_mlp": 0.01104363, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.00050807, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.7538829015093753, + "language_loss": 0.72090578, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74282348, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.7054033279418945 + }, + { + "auxiliary_loss_clip": 0.01147834, + "auxiliary_loss_mlp": 0.01103697, + "balance_loss_clip": 1.00180078, + "balance_loss_mlp": 1.00060511, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.5831117440800142, + "language_loss": 0.66596925, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68848455, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 2.596041440963745 + }, + { + "auxiliary_loss_clip": 0.0114824, + "auxiliary_loss_mlp": 0.01102772, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.0004425, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.721271089814605, + "language_loss": 0.80527788, + "learning_rate": 2.522343063158261e-07, + "loss": 0.827788, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.511131525039673 + }, + { + "auxiliary_loss_clip": 0.01148152, + "auxiliary_loss_mlp": 0.01101994, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00052285, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.4359022995145845, + "language_loss": 0.77620161, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79870307, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 2.5297904014587402 + }, + { + "auxiliary_loss_clip": 0.01135792, + "auxiliary_loss_mlp": 0.01103095, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00076544, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.4366793334358898, + "language_loss": 0.82687533, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84926414, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.584933280944824 + }, + { + "auxiliary_loss_clip": 0.01133401, + "auxiliary_loss_mlp": 0.01103121, + "balance_loss_clip": 1.00160873, + "balance_loss_mlp": 1.00041068, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.346253294311702, + "language_loss": 0.56687301, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58923829, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 2.7269506454467773 + }, + { + "auxiliary_loss_clip": 0.01133655, + "auxiliary_loss_mlp": 0.01103384, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00048244, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 1.9235110176672527, + "language_loss": 0.63380492, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65617532, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 2.5896527767181396 + }, + { + "auxiliary_loss_clip": 0.01164978, + "auxiliary_loss_mlp": 0.01102627, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00067961, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.7413250934980118, + "language_loss": 0.75172377, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77439982, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 2.50140380859375 + }, + { + "auxiliary_loss_clip": 0.01133598, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00043201, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 2.1321562987389053, + "language_loss": 0.83060861, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85297126, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.5821216106414795 + }, + { + "auxiliary_loss_clip": 0.01133109, + "auxiliary_loss_mlp": 0.01102734, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.0004046, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.6881804695398939, + "language_loss": 0.80151844, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82387686, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.570241928100586 + }, + { + "auxiliary_loss_clip": 0.0111677, + "auxiliary_loss_mlp": 0.01102747, + "balance_loss_clip": 1.00168812, + "balance_loss_mlp": 1.00041795, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.3641884721906172, + "language_loss": 0.75882339, + "learning_rate": 2.507217751976478e-07, + "loss": 0.78101856, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.6351616382598877 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01102836, + "balance_loss_clip": 1.00173903, + "balance_loss_mlp": 1.00060201, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.6724280423626328, + "language_loss": 0.83430099, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85649669, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.636871576309204 + }, + { + "auxiliary_loss_clip": 0.01118659, + "auxiliary_loss_mlp": 0.01103053, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.0004375, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.4095408467212436, + "language_loss": 0.78659058, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80880773, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.6936700344085693 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.01103452, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00055027, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.4364729845762307, + "language_loss": 0.72436547, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74690354, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 2.6458115577697754 + }, + { + "auxiliary_loss_clip": 0.01164863, + "auxiliary_loss_mlp": 0.0110141, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00051129, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.6657567841047272, + "language_loss": 0.69873589, + "learning_rate": 2.49967101396557e-07, + "loss": 0.72139859, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 3.930788516998291 + }, + { + "auxiliary_loss_clip": 0.01164858, + "auxiliary_loss_mlp": 0.01102142, + "balance_loss_clip": 1.00180757, + "balance_loss_mlp": 1.00047994, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.6427370023017747, + "language_loss": 0.69102079, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71369076, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.5975775718688965 + }, + { + "auxiliary_loss_clip": 0.01100109, + "auxiliary_loss_mlp": 0.01102745, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00051129, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.5026921931083184, + "language_loss": 0.76325768, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78528619, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.6744296550750732 + }, + { + "auxiliary_loss_clip": 0.01165223, + "auxiliary_loss_mlp": 0.01103793, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00041509, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 1.8659959469307696, + "language_loss": 0.79155755, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81424773, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 2.4854354858398438 + }, + { + "auxiliary_loss_clip": 0.01117306, + "auxiliary_loss_mlp": 0.01102536, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.00049281, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.000420905208907, + "language_loss": 0.69125479, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71345323, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.577970504760742 + }, + { + "auxiliary_loss_clip": 0.01133801, + "auxiliary_loss_mlp": 0.01103055, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.00063014, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.987871743318694, + "language_loss": 0.6887815, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71115005, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 2.578824758529663 + }, + { + "auxiliary_loss_clip": 0.0113156, + "auxiliary_loss_mlp": 0.01102793, + "balance_loss_clip": 1.00177979, + "balance_loss_mlp": 1.00055885, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.6605837481202883, + "language_loss": 0.74740577, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76974928, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.5854828357696533 + }, + { + "auxiliary_loss_clip": 0.01165034, + "auxiliary_loss_mlp": 0.00747426, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00038004, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 1.9819856050472937, + "language_loss": 0.72150224, + "learning_rate": 2.486489774343865e-07, + "loss": 0.74062687, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.4913601875305176 + }, + { + "auxiliary_loss_clip": 0.01132527, + "auxiliary_loss_mlp": 0.01102022, + "balance_loss_clip": 1.0016551, + "balance_loss_mlp": 1.00045586, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.742344330522328, + "language_loss": 0.7511698, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77351534, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 2.566866159439087 + }, + { + "auxiliary_loss_clip": 0.01135091, + "auxiliary_loss_mlp": 0.00747374, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00037885, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.80367449975912, + "language_loss": 0.78337324, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80219793, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.5581369400024414 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01103423, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00052142, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 2.062965626395779, + "language_loss": 0.7811389, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80350709, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.576798439025879 + }, + { + "auxiliary_loss_clip": 0.0113382, + "auxiliary_loss_mlp": 0.0110273, + "balance_loss_clip": 1.00172019, + "balance_loss_mlp": 1.00049639, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 3.979722253893837, + "language_loss": 0.72229999, + "learning_rate": 2.478972246355935e-07, + "loss": 0.7446655, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.650219202041626 + }, + { + "auxiliary_loss_clip": 0.01051167, + "auxiliary_loss_mlp": 0.011023, + "balance_loss_clip": 1.00133061, + "balance_loss_mlp": 1.00063848, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.5549679341249856, + "language_loss": 0.73388433, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75541902, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 2.7916057109832764 + }, + { + "auxiliary_loss_clip": 0.01144043, + "auxiliary_loss_mlp": 0.00746415, + "balance_loss_clip": 1.00121057, + "balance_loss_mlp": 1.00110006, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.7976751793973972, + "language_loss": 0.60637707, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62528169, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 5.848540306091309 + }, + { + "auxiliary_loss_clip": 0.01135289, + "auxiliary_loss_mlp": 0.00747368, + "balance_loss_clip": 1.00173998, + "balance_loss_mlp": 1.0003798, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.6438079121323064, + "language_loss": 0.71965384, + "learning_rate": 2.473341076306303e-07, + "loss": 0.73848045, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.6073765754699707 + }, + { + "auxiliary_loss_clip": 0.01148162, + "auxiliary_loss_mlp": 0.01102013, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00044692, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 4.946406185177671, + "language_loss": 0.74545991, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76796162, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 4.207874298095703 + }, + { + "auxiliary_loss_clip": 0.01133837, + "auxiliary_loss_mlp": 0.0110154, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00049841, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 2.1252211524905276, + "language_loss": 0.73638415, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75873792, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.5827529430389404 + }, + { + "auxiliary_loss_clip": 0.01150502, + "auxiliary_loss_mlp": 0.01102751, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00042224, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.6042275075482917, + "language_loss": 0.74120539, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76373792, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 2.5502309799194336 + }, + { + "auxiliary_loss_clip": 0.01150062, + "auxiliary_loss_mlp": 0.01103332, + "balance_loss_clip": 1.00180304, + "balance_loss_mlp": 1.00052547, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.3488680665243358, + "language_loss": 0.78207242, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80460644, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.6490354537963867 + }, + { + "auxiliary_loss_clip": 0.01148298, + "auxiliary_loss_mlp": 0.01101857, + "balance_loss_clip": 1.00184047, + "balance_loss_mlp": 1.00057697, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.7965633111415846, + "language_loss": 0.72894061, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75144219, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 2.571100950241089 + }, + { + "auxiliary_loss_clip": 0.01150541, + "auxiliary_loss_mlp": 0.01104284, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00061941, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.467009356348356, + "language_loss": 0.67503792, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69758612, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 2.5098657608032227 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01103385, + "balance_loss_clip": 1.00171018, + "balance_loss_mlp": 1.00038803, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.8268569505815127, + "language_loss": 0.7748915, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79694825, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.7152721881866455 + }, + { + "auxiliary_loss_clip": 0.01165145, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00066614, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.453884671097878, + "language_loss": 0.69802725, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72071147, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 2.550722360610962 + }, + { + "auxiliary_loss_clip": 0.01165278, + "auxiliary_loss_mlp": 0.01104599, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00055361, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 2.6099055676988296, + "language_loss": 0.57327282, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59597158, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 2.476691961288452 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.0110447, + "balance_loss_clip": 1.00174987, + "balance_loss_mlp": 1.00070977, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.7359905902278527, + "language_loss": 0.76132572, + "learning_rate": 2.454613720076277e-07, + "loss": 0.78370798, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 2.5805273056030273 + }, + { + "auxiliary_loss_clip": 0.0113147, + "auxiliary_loss_mlp": 0.01104191, + "balance_loss_clip": 1.00195229, + "balance_loss_mlp": 1.00043082, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.1149634881217025, + "language_loss": 0.70788646, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73024309, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.577972888946533 + }, + { + "auxiliary_loss_clip": 0.01097979, + "auxiliary_loss_mlp": 0.01081275, + "balance_loss_clip": 1.00155282, + "balance_loss_mlp": 1.0004034, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6312860373501441, + "language_loss": 0.52621889, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54801142, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 3.3086066246032715 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_clip": 1.00189435, + "balance_loss_mlp": 1.00058627, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.9043977111661412, + "language_loss": 0.82035655, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84254545, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 2.7596120834350586 + }, + { + "auxiliary_loss_clip": 0.01148629, + "auxiliary_loss_mlp": 0.01103485, + "balance_loss_clip": 1.00199354, + "balance_loss_mlp": 1.00067937, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 1.9510027760196866, + "language_loss": 0.72562534, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74814647, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 2.550325393676758 + }, + { + "auxiliary_loss_clip": 0.01116761, + "auxiliary_loss_mlp": 0.0110122, + "balance_loss_clip": 1.00168228, + "balance_loss_mlp": 1.00056016, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.8267361065232306, + "language_loss": 0.77657104, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79875088, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.681007146835327 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.0110367, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00057828, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.4470525867914095, + "language_loss": 0.69875622, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72093916, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 2.628959894180298 + }, + { + "auxiliary_loss_clip": 0.01117842, + "auxiliary_loss_mlp": 0.01102171, + "balance_loss_clip": 1.00169301, + "balance_loss_mlp": 1.00050902, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 1.69044019226612, + "language_loss": 0.71060061, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73280072, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 2.721984386444092 + }, + { + "auxiliary_loss_clip": 0.01109481, + "auxiliary_loss_mlp": 0.0108012, + "balance_loss_clip": 1.00120628, + "balance_loss_mlp": 1.00001132, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6965980821864967, + "language_loss": 0.6055131, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.6274091, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 3.2827022075653076 + }, + { + "auxiliary_loss_clip": 0.01133894, + "auxiliary_loss_mlp": 0.01102821, + "balance_loss_clip": 1.00175893, + "balance_loss_mlp": 1.00058663, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.5664677004008658, + "language_loss": 0.74411947, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76648664, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.6011111736297607 + }, + { + "auxiliary_loss_clip": 0.01100249, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_clip": 1.00160265, + "balance_loss_mlp": 1.00049448, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.6715199895282296, + "language_loss": 0.67310071, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69512856, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.7932870388031006 + }, + { + "auxiliary_loss_clip": 0.01143692, + "auxiliary_loss_mlp": 0.00746528, + "balance_loss_clip": 1.00108135, + "balance_loss_mlp": 1.00107408, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7309979486714258, + "language_loss": 0.61031353, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62921584, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 2.975945472717285 + }, + { + "auxiliary_loss_clip": 0.01115559, + "auxiliary_loss_mlp": 0.01103484, + "balance_loss_clip": 1.00160122, + "balance_loss_mlp": 1.00048721, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.7227051214605673, + "language_loss": 0.72346461, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.745655, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.6311306953430176 + }, + { + "auxiliary_loss_clip": 0.01131453, + "auxiliary_loss_mlp": 0.01104619, + "balance_loss_clip": 1.00171494, + "balance_loss_mlp": 1.00047815, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 1.6163786987905788, + "language_loss": 0.78019357, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80255425, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 4.1431849002838135 + }, + { + "auxiliary_loss_clip": 0.01148325, + "auxiliary_loss_mlp": 0.01103005, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00058019, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 1.9662834684125372, + "language_loss": 0.7495904, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77210367, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 2.5948193073272705 + }, + { + "auxiliary_loss_clip": 0.01131575, + "auxiliary_loss_mlp": 0.01102465, + "balance_loss_clip": 1.00176251, + "balance_loss_mlp": 1.00042176, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 1.9887100981763526, + "language_loss": 0.7309953, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75333565, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 2.5700366497039795 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01104085, + "balance_loss_clip": 1.00180256, + "balance_loss_mlp": 1.00042093, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 1.8649849784989938, + "language_loss": 0.77649492, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79887307, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.5828194618225098 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01104288, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00052845, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 1.9862028044775564, + "language_loss": 0.74847317, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77068627, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.583855628967285 + }, + { + "auxiliary_loss_clip": 0.0110181, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_clip": 1.00164711, + "balance_loss_mlp": 1.00037992, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.3020280089282132, + "language_loss": 0.85404253, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87608391, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 2.6198623180389404 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_clip": 1.00175798, + "balance_loss_mlp": 1.0005753, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.63881233979551, + "language_loss": 0.58813268, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61036056, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.615663528442383 + }, + { + "auxiliary_loss_clip": 0.01132132, + "auxiliary_loss_mlp": 0.01104162, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00040174, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.0038849178889944, + "language_loss": 0.65627217, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.67863512, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.604377508163452 + }, + { + "auxiliary_loss_clip": 0.01150196, + "auxiliary_loss_mlp": 0.01103825, + "balance_loss_clip": 1.00186992, + "balance_loss_mlp": 1.00054216, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.7570746821504464, + "language_loss": 0.72603273, + "learning_rate": 2.41550291894576e-07, + "loss": 0.74857295, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.558405876159668 + }, + { + "auxiliary_loss_clip": 0.01102154, + "auxiliary_loss_mlp": 0.01103073, + "balance_loss_clip": 1.00146461, + "balance_loss_mlp": 1.00055301, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 2.909332565823504, + "language_loss": 0.75857234, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78062457, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.637301206588745 + }, + { + "auxiliary_loss_clip": 0.01100636, + "auxiliary_loss_mlp": 0.01104481, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.00062573, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.697619047855327, + "language_loss": 0.65990448, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68195564, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.7068428993225098 + }, + { + "auxiliary_loss_clip": 0.01099849, + "auxiliary_loss_mlp": 0.011024, + "balance_loss_clip": 1.00167561, + "balance_loss_mlp": 1.00054765, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 3.0082008967225735, + "language_loss": 0.69746447, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71948701, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 2.6260619163513184 + }, + { + "auxiliary_loss_clip": 0.01100774, + "auxiliary_loss_mlp": 0.01102284, + "balance_loss_clip": 1.00152802, + "balance_loss_mlp": 1.00033665, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.4458388181349495, + "language_loss": 0.71179777, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73382837, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.6876919269561768 + }, + { + "auxiliary_loss_clip": 0.011506, + "auxiliary_loss_mlp": 0.01102791, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00055671, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.8086684772248076, + "language_loss": 0.74915123, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77168512, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 2.546415328979492 + }, + { + "auxiliary_loss_clip": 0.01128947, + "auxiliary_loss_mlp": 0.01103558, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00046563, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.4045825816150046, + "language_loss": 0.73871392, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76103902, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 5.497511625289917 + }, + { + "auxiliary_loss_clip": 0.01148449, + "auxiliary_loss_mlp": 0.01103667, + "balance_loss_clip": 1.00186551, + "balance_loss_mlp": 1.00057459, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 4.603145697403344, + "language_loss": 0.71908551, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74160659, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 3.959373950958252 + }, + { + "auxiliary_loss_clip": 0.01165112, + "auxiliary_loss_mlp": 0.01103362, + "balance_loss_clip": 1.00204551, + "balance_loss_mlp": 1.0005554, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.3813276749852268, + "language_loss": 0.79258734, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81527209, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.5211730003356934 + }, + { + "auxiliary_loss_clip": 0.01085657, + "auxiliary_loss_mlp": 0.01103623, + "balance_loss_clip": 1.00168049, + "balance_loss_mlp": 1.00053132, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 2.031914275469982, + "language_loss": 0.76282239, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.78471518, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.6708455085754395 + }, + { + "auxiliary_loss_clip": 0.01160453, + "auxiliary_loss_mlp": 0.01079714, + "balance_loss_clip": 1.00121462, + "balance_loss_mlp": 0.99998647, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.820362430547261, + "language_loss": 0.59415793, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61655962, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 3.147239923477173 + }, + { + "auxiliary_loss_clip": 0.01133687, + "auxiliary_loss_mlp": 0.01103982, + "balance_loss_clip": 1.00175035, + "balance_loss_mlp": 1.00060368, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 2.4243239685499547, + "language_loss": 0.70091587, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72329259, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 2.5738563537597656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01102366, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00060868, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 1.8372687952262403, + "language_loss": 0.83088887, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85356176, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.4866089820861816 + }, + { + "auxiliary_loss_clip": 0.01145854, + "auxiliary_loss_mlp": 0.01102662, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00061893, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.6175211878316402, + "language_loss": 0.71007389, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73255903, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 2.5890536308288574 + }, + { + "auxiliary_loss_clip": 0.01148296, + "auxiliary_loss_mlp": 0.00747291, + "balance_loss_clip": 1.00177634, + "balance_loss_mlp": 1.00035763, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.61491811287842, + "language_loss": 0.81026828, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82922417, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.6252119541168213 + }, + { + "auxiliary_loss_clip": 0.01148559, + "auxiliary_loss_mlp": 0.01103285, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00047863, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.9127271180442715, + "language_loss": 0.77246583, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79498428, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 2.5842103958129883 + }, + { + "auxiliary_loss_clip": 0.01118041, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_clip": 1.00170422, + "balance_loss_mlp": 1.00047898, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.639610808316672, + "language_loss": 0.80356741, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82577586, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.605461359024048 + }, + { + "auxiliary_loss_clip": 0.01150691, + "auxiliary_loss_mlp": 0.0074737, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00035357, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 2.2722943654795102, + "language_loss": 0.71908712, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73806775, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 2.539318561553955 + }, + { + "auxiliary_loss_clip": 0.01150535, + "auxiliary_loss_mlp": 0.01103588, + "balance_loss_clip": 1.00189221, + "balance_loss_mlp": 1.00049543, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 2.2002521227960563, + "language_loss": 0.63702518, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.6595664, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 2.6114754676818848 + }, + { + "auxiliary_loss_clip": 0.01148314, + "auxiliary_loss_mlp": 0.01103703, + "balance_loss_clip": 1.00175107, + "balance_loss_mlp": 1.00051594, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 3.0949336089391397, + "language_loss": 0.74041522, + "learning_rate": 2.380370324111085e-07, + "loss": 0.76293534, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.5431134700775146 + }, + { + "auxiliary_loss_clip": 0.01149859, + "auxiliary_loss_mlp": 0.0110364, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.00045252, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 4.306734334622138, + "language_loss": 0.7164048, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73893976, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 2.597590208053589 + }, + { + "auxiliary_loss_clip": 0.01134057, + "auxiliary_loss_mlp": 0.01103633, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00044537, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.693418318152198, + "language_loss": 0.81187737, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83425432, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.542935371398926 + }, + { + "auxiliary_loss_clip": 0.01165124, + "auxiliary_loss_mlp": 0.01103601, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00060427, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 1.901118088066207, + "language_loss": 0.78689861, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80958593, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 2.486741304397583 + }, + { + "auxiliary_loss_clip": 0.01148371, + "auxiliary_loss_mlp": 0.01104053, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.0005796, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 1.9362339879305759, + "language_loss": 0.78975582, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.81228012, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 2.521436929702759 + }, + { + "auxiliary_loss_clip": 0.01133789, + "auxiliary_loss_mlp": 0.01104547, + "balance_loss_clip": 1.00166965, + "balance_loss_mlp": 1.00050151, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 3.286035718210669, + "language_loss": 0.50619954, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52858287, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 2.5668108463287354 + }, + { + "auxiliary_loss_clip": 0.01114646, + "auxiliary_loss_mlp": 0.01102654, + "balance_loss_clip": 1.00156522, + "balance_loss_mlp": 1.00051546, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 2.525314682807979, + "language_loss": 0.7545771, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77675009, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 2.6104748249053955 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.0110245, + "balance_loss_clip": 1.0016216, + "balance_loss_mlp": 1.00040674, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5339165440394393, + "language_loss": 0.73107511, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.7532624, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 2.7113139629364014 + }, + { + "auxiliary_loss_clip": 0.01164882, + "auxiliary_loss_mlp": 0.01103073, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00055313, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.582909807516568, + "language_loss": 0.72827709, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.75095665, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.524679660797119 + }, + { + "auxiliary_loss_clip": 0.01080681, + "auxiliary_loss_mlp": 0.01103777, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00058985, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.1122164162891135, + "language_loss": 0.7399894, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.76183403, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.657200336456299 + }, + { + "auxiliary_loss_clip": 0.01084103, + "auxiliary_loss_mlp": 0.01102854, + "balance_loss_clip": 1.00149632, + "balance_loss_mlp": 1.0006206, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.6443541667158188, + "language_loss": 0.76331604, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78518569, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.732163190841675 + }, + { + "auxiliary_loss_clip": 0.01165066, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_clip": 1.00200355, + "balance_loss_mlp": 1.00054812, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 1.5762607106362163, + "language_loss": 0.67625958, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69892955, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 3.9637906551361084 + }, + { + "auxiliary_loss_clip": 0.01150317, + "auxiliary_loss_mlp": 0.0110362, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.0005281, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.3200366896041087, + "language_loss": 0.73729599, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75983536, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 2.599383592605591 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01102921, + "balance_loss_clip": 1.00173736, + "balance_loss_mlp": 1.0004015, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 1.9037632565397042, + "language_loss": 0.6673342, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68951404, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.6335253715515137 + }, + { + "auxiliary_loss_clip": 0.01165192, + "auxiliary_loss_mlp": 0.01103098, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.0005784, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.501108739301797, + "language_loss": 0.78680038, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80948329, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 2.5042312145233154 + }, + { + "auxiliary_loss_clip": 0.01165199, + "auxiliary_loss_mlp": 0.01103305, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00059402, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 2.3513549643771317, + "language_loss": 0.79194379, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81462884, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.4911115169525146 + }, + { + "auxiliary_loss_clip": 0.01148497, + "auxiliary_loss_mlp": 0.01103782, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.00049901, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.732306876758041, + "language_loss": 0.68567562, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70819843, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 2.5248467922210693 + }, + { + "auxiliary_loss_clip": 0.01135513, + "auxiliary_loss_mlp": 0.01103252, + "balance_loss_clip": 1.00171542, + "balance_loss_mlp": 1.00044632, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 2.028537133959017, + "language_loss": 0.64778173, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67016935, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 2.6122865676879883 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01103075, + "balance_loss_clip": 1.00180995, + "balance_loss_mlp": 1.00065017, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.6731472905690583, + "language_loss": 0.73342252, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75562477, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.571622371673584 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00038767, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.5894900906354013, + "language_loss": 0.77360684, + "learning_rate": 2.345478926864446e-07, + "loss": 0.79581022, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.6061151027679443 + }, + { + "auxiliary_loss_clip": 0.01148864, + "auxiliary_loss_mlp": 0.01103958, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.0004847, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 1.53373270789731, + "language_loss": 0.75842136, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.78094965, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.580256462097168 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01079406, + "balance_loss_clip": 1.00122941, + "balance_loss_mlp": 1.00006068, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8082042492172642, + "language_loss": 0.60120165, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.6230644, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 3.1833503246307373 + }, + { + "auxiliary_loss_clip": 0.01148443, + "auxiliary_loss_mlp": 0.01102977, + "balance_loss_clip": 1.00187695, + "balance_loss_mlp": 1.00064778, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 6.353697060779703, + "language_loss": 0.79613054, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81864476, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.5853822231292725 + }, + { + "auxiliary_loss_clip": 0.01148127, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00060987, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 1.8787410007531418, + "language_loss": 0.83191645, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85442424, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.535027265548706 + }, + { + "auxiliary_loss_clip": 0.01100579, + "auxiliary_loss_mlp": 0.01103258, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00054729, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.816097710952631, + "language_loss": 0.71615398, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73819232, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 5.549565076828003 + }, + { + "auxiliary_loss_clip": 0.01165227, + "auxiliary_loss_mlp": 0.01105113, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00058997, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.5078553143287747, + "language_loss": 0.73753875, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.76024216, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.5607118606567383 + }, + { + "auxiliary_loss_clip": 0.01101789, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00165534, + "balance_loss_mlp": 1.00047517, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.5187979011725687, + "language_loss": 0.6777398, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69978482, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 4.061531066894531 + }, + { + "auxiliary_loss_clip": 0.01118609, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.00178051, + "balance_loss_mlp": 1.00036156, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.827201543449848, + "language_loss": 0.69385248, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71251249, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 2.6131656169891357 + }, + { + "auxiliary_loss_clip": 0.01135812, + "auxiliary_loss_mlp": 0.01102968, + "balance_loss_clip": 1.00192654, + "balance_loss_mlp": 1.00063848, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 12.91559911449685, + "language_loss": 0.77866304, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.8010509, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.5605673789978027 + }, + { + "auxiliary_loss_clip": 0.0108415, + "auxiliary_loss_mlp": 0.01103564, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00047207, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 2.665003538412221, + "language_loss": 0.68007624, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70195341, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.6951401233673096 + }, + { + "auxiliary_loss_clip": 0.01147749, + "auxiliary_loss_mlp": 0.01103606, + "balance_loss_clip": 1.00163972, + "balance_loss_mlp": 1.00051403, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 2.207195945103598, + "language_loss": 0.7110703, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73358381, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 2.57136607170105 + }, + { + "auxiliary_loss_clip": 0.01116269, + "auxiliary_loss_mlp": 0.01102847, + "balance_loss_clip": 1.00154817, + "balance_loss_mlp": 1.00042272, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 2.555364226629057, + "language_loss": 0.68285227, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70504344, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 2.6483263969421387 + }, + { + "auxiliary_loss_clip": 0.01164949, + "auxiliary_loss_mlp": 0.01102935, + "balance_loss_clip": 1.00187123, + "balance_loss_mlp": 1.0006057, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.7230795233726597, + "language_loss": 0.70127493, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72395372, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.549454689025879 + }, + { + "auxiliary_loss_clip": 0.01100358, + "auxiliary_loss_mlp": 0.00746543, + "balance_loss_clip": 1.00126505, + "balance_loss_mlp": 1.00116408, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7283317182197481, + "language_loss": 0.57587558, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59434456, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 3.334418296813965 + }, + { + "auxiliary_loss_clip": 0.01117343, + "auxiliary_loss_mlp": 0.01103509, + "balance_loss_clip": 1.00175822, + "balance_loss_mlp": 1.00041652, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 1.9687154573666028, + "language_loss": 0.78811204, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81032062, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.640805959701538 + }, + { + "auxiliary_loss_clip": 0.0114859, + "auxiliary_loss_mlp": 0.01104238, + "balance_loss_clip": 1.00159311, + "balance_loss_mlp": 1.00057399, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 2.44832750221336, + "language_loss": 0.63351178, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65604007, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 2.5047926902770996 + }, + { + "auxiliary_loss_clip": 0.01148538, + "auxiliary_loss_mlp": 0.01104067, + "balance_loss_clip": 1.00191987, + "balance_loss_mlp": 1.00059366, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 1.8402295391077323, + "language_loss": 0.84057999, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86310607, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 2.5290942192077637 + }, + { + "auxiliary_loss_clip": 0.01118952, + "auxiliary_loss_mlp": 0.011026, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00055671, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 127.22713690238476, + "language_loss": 0.78392363, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80613917, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.63839054107666 + }, + { + "auxiliary_loss_clip": 0.01148273, + "auxiliary_loss_mlp": 0.01103236, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.00071573, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.8070956366504347, + "language_loss": 0.64497197, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66748703, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 2.5218749046325684 + }, + { + "auxiliary_loss_clip": 0.01115112, + "auxiliary_loss_mlp": 0.01103636, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.0003531, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.636218042658317, + "language_loss": 0.70485175, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72703922, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.589388132095337 + }, + { + "auxiliary_loss_clip": 0.01117717, + "auxiliary_loss_mlp": 0.01104969, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00054145, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 2.3181972880239003, + "language_loss": 0.64610004, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66832697, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 2.6533796787261963 + }, + { + "auxiliary_loss_clip": 0.01116892, + "auxiliary_loss_mlp": 0.01103269, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00055861, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.4138934014274214, + "language_loss": 0.70868021, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.73088181, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.7457656860351562 + }, + { + "auxiliary_loss_clip": 0.01102347, + "auxiliary_loss_mlp": 0.01103225, + "balance_loss_clip": 1.00164223, + "balance_loss_mlp": 1.00051451, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 2.9968355715798256, + "language_loss": 0.65499598, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67705166, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.721606492996216 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.0074735, + "balance_loss_clip": 1.00161338, + "balance_loss_mlp": 1.00032306, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 1.7345065183365156, + "language_loss": 0.67835259, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69698644, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 2.6277897357940674 + }, + { + "auxiliary_loss_clip": 0.0108102, + "auxiliary_loss_mlp": 0.01102365, + "balance_loss_clip": 1.00166762, + "balance_loss_mlp": 1.00051308, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.088411048650071, + "language_loss": 0.65400922, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67584306, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 2.663404941558838 + }, + { + "auxiliary_loss_clip": 0.0113525, + "auxiliary_loss_mlp": 0.0110213, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00056374, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9555627541188854, + "language_loss": 0.8585574, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.88093114, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 2.5581886768341064 + }, + { + "auxiliary_loss_clip": 0.01165012, + "auxiliary_loss_mlp": 0.01102599, + "balance_loss_clip": 1.00183713, + "balance_loss_mlp": 1.00046027, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.444493331246438, + "language_loss": 0.83679181, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85946792, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 2.4981772899627686 + }, + { + "auxiliary_loss_clip": 0.011506, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00052595, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.6537734515000877, + "language_loss": 0.85777128, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.88031721, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 2.4832394123077393 + }, + { + "auxiliary_loss_clip": 0.01133508, + "auxiliary_loss_mlp": 0.01102934, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00041401, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.793086761188789, + "language_loss": 0.71938527, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74174964, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.5742335319519043 + }, + { + "auxiliary_loss_clip": 0.01131119, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_clip": 1.00177467, + "balance_loss_mlp": 1.00052798, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.7999892339045065, + "language_loss": 0.76101726, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78336132, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 4.019456624984741 + }, + { + "auxiliary_loss_clip": 0.01119183, + "auxiliary_loss_mlp": 0.01103195, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00067484, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.7235290795594773, + "language_loss": 0.72405541, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74627912, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 2.59065842628479 + }, + { + "auxiliary_loss_clip": 0.01081228, + "auxiliary_loss_mlp": 0.01079318, + "balance_loss_clip": 1.00103807, + "balance_loss_mlp": 0.99997216, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8799993352002106, + "language_loss": 0.59634733, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61795282, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 3.027963161468506 + }, + { + "auxiliary_loss_clip": 0.01129442, + "auxiliary_loss_mlp": 0.01079683, + "balance_loss_clip": 1.00117254, + "balance_loss_mlp": 0.99995577, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6892729765869848, + "language_loss": 0.6117084, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63379967, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.163560152053833 + }, + { + "auxiliary_loss_clip": 0.01149762, + "auxiliary_loss_mlp": 0.01103229, + "balance_loss_clip": 1.00172889, + "balance_loss_mlp": 1.00051796, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.9668773303813583, + "language_loss": 0.80930585, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.83183575, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.5679736137390137 + }, + { + "auxiliary_loss_clip": 0.0111524, + "auxiliary_loss_mlp": 0.01101885, + "balance_loss_clip": 1.00156951, + "balance_loss_mlp": 1.00050986, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.635515180678476, + "language_loss": 0.79311502, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81528622, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 2.636091709136963 + }, + { + "auxiliary_loss_clip": 0.01114858, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_clip": 1.00163543, + "balance_loss_mlp": 1.00044107, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 2.323559774680473, + "language_loss": 0.70345008, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72563398, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.623192071914673 + }, + { + "auxiliary_loss_clip": 0.01115034, + "auxiliary_loss_mlp": 0.01102662, + "balance_loss_clip": 1.00163567, + "balance_loss_mlp": 1.00052404, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.6841713271425227, + "language_loss": 0.73572707, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75790405, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.6516153812408447 + }, + { + "auxiliary_loss_clip": 0.01068614, + "auxiliary_loss_mlp": 0.01101917, + "balance_loss_clip": 1.00154448, + "balance_loss_mlp": 1.00044584, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 1.9003294713731576, + "language_loss": 0.79801643, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81972176, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 2.8130788803100586 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01103024, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00050449, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 2.0572844201148115, + "language_loss": 0.79129696, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81382799, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 2.592071294784546 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01103688, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00069153, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 1.6892668377977729, + "language_loss": 0.7120508, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73443639, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.5578536987304688 + }, + { + "auxiliary_loss_clip": 0.01150121, + "auxiliary_loss_mlp": 0.01104811, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00047863, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 3.7177147487460274, + "language_loss": 0.70428628, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72683561, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.6358401775360107 + }, + { + "auxiliary_loss_clip": 0.01150217, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_clip": 1.00177753, + "balance_loss_mlp": 1.00046992, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 1.9390658917786865, + "language_loss": 0.78219515, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80473584, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 3.949552297592163 + }, + { + "auxiliary_loss_clip": 0.01150411, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_clip": 1.00192893, + "balance_loss_mlp": 1.00059187, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 1.877642317662444, + "language_loss": 0.76527148, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.7878058, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 4.066485643386841 + }, + { + "auxiliary_loss_clip": 0.01128625, + "auxiliary_loss_mlp": 0.01079702, + "balance_loss_clip": 1.00128806, + "balance_loss_mlp": 0.99997503, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6933112909589462, + "language_loss": 0.55026507, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57234836, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 4.60131311416626 + }, + { + "auxiliary_loss_clip": 0.01149955, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00050747, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 5.2137500130312535, + "language_loss": 0.72541636, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.74794614, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 2.5442006587982178 + }, + { + "auxiliary_loss_clip": 0.01117187, + "auxiliary_loss_mlp": 0.01102574, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00053072, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.5361234935612365, + "language_loss": 0.67260849, + "learning_rate": 2.26200679088697e-07, + "loss": 0.6948061, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.63472580909729 + }, + { + "auxiliary_loss_clip": 0.01135188, + "auxiliary_loss_mlp": 0.01102681, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00054252, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 1.9420533823085617, + "language_loss": 0.73688281, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75926149, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 2.577523708343506 + }, + { + "auxiliary_loss_clip": 0.01165047, + "auxiliary_loss_mlp": 0.01103397, + "balance_loss_clip": 1.00189388, + "balance_loss_mlp": 1.00049531, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.6656457433743332, + "language_loss": 0.80420899, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82689345, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 2.535875082015991 + }, + { + "auxiliary_loss_clip": 0.01165052, + "auxiliary_loss_mlp": 0.01102345, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00044537, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 2.0427186480979294, + "language_loss": 0.76645631, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78913033, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 2.544332265853882 + }, + { + "auxiliary_loss_clip": 0.01165286, + "auxiliary_loss_mlp": 0.01104007, + "balance_loss_clip": 1.00201285, + "balance_loss_mlp": 1.00053382, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.6194305602693893, + "language_loss": 0.63684702, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65954, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.4887330532073975 + }, + { + "auxiliary_loss_clip": 0.01145586, + "auxiliary_loss_mlp": 0.01102595, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00036097, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.224434818698246, + "language_loss": 0.86739033, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88987213, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.4847891330718994 + }, + { + "auxiliary_loss_clip": 0.01150718, + "auxiliary_loss_mlp": 0.01103841, + "balance_loss_clip": 1.00203156, + "balance_loss_mlp": 1.00046313, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.9424324211292565, + "language_loss": 0.54753262, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.57007825, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 2.569484233856201 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.01102077, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.00060594, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.8751877850564114, + "language_loss": 0.69742823, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71978575, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.540813684463501 + }, + { + "auxiliary_loss_clip": 0.01131544, + "auxiliary_loss_mlp": 0.00747373, + "balance_loss_clip": 1.00162172, + "balance_loss_mlp": 1.0004077, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.2215107803037184, + "language_loss": 0.76780957, + "learning_rate": 2.247634997500205e-07, + "loss": 0.78659874, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.5783674716949463 + }, + { + "auxiliary_loss_clip": 0.01118905, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00040245, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.5352194724082262, + "language_loss": 0.81748605, + "learning_rate": 2.245841551883676e-07, + "loss": 0.8361485, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 2.6810972690582275 + }, + { + "auxiliary_loss_clip": 0.01165254, + "auxiliary_loss_mlp": 0.01103804, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.00052071, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 3.164810284278876, + "language_loss": 0.65743393, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.68012446, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.4682281017303467 + }, + { + "auxiliary_loss_clip": 0.01133815, + "auxiliary_loss_mlp": 0.00747258, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00037861, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 2.444003234745527, + "language_loss": 0.78755629, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80636704, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 2.644002676010132 + }, + { + "auxiliary_loss_clip": 0.01131157, + "auxiliary_loss_mlp": 0.01103128, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00041723, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 2.5510040084300805, + "language_loss": 0.73138118, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.7537241, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.6365604400634766 + }, + { + "auxiliary_loss_clip": 0.01115184, + "auxiliary_loss_mlp": 0.01103896, + "balance_loss_clip": 1.00153112, + "balance_loss_mlp": 1.00070858, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.5907720927242353, + "language_loss": 0.7476027, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76979345, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 2.602330446243286 + }, + { + "auxiliary_loss_clip": 0.01164968, + "auxiliary_loss_mlp": 0.01102966, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00044584, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.155253333226249, + "language_loss": 0.8151499, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83782923, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.488309383392334 + }, + { + "auxiliary_loss_clip": 0.01102697, + "auxiliary_loss_mlp": 0.01103668, + "balance_loss_clip": 1.00182331, + "balance_loss_mlp": 1.00057626, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.0435340935476587, + "language_loss": 0.61378765, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63585132, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 2.6919898986816406 + }, + { + "auxiliary_loss_clip": 0.01164992, + "auxiliary_loss_mlp": 0.01102042, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00057149, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.106743527269434, + "language_loss": 0.7243644, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74703479, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.462812662124634 + }, + { + "auxiliary_loss_clip": 0.01099804, + "auxiliary_loss_mlp": 0.01103169, + "balance_loss_clip": 1.00152647, + "balance_loss_mlp": 1.00055408, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.5045213031365454, + "language_loss": 0.71062708, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.73265684, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 2.682675361633301 + }, + { + "auxiliary_loss_clip": 0.01133553, + "auxiliary_loss_mlp": 0.01103172, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00055623, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 1.8427154982794038, + "language_loss": 0.72572744, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74809462, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 2.5690364837646484 + }, + { + "auxiliary_loss_clip": 0.01165241, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_clip": 1.00205123, + "balance_loss_mlp": 1.0005722, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 3.2256670071333717, + "language_loss": 0.76884401, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.79152358, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 2.4905714988708496 + }, + { + "auxiliary_loss_clip": 0.01119288, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_clip": 1.00183368, + "balance_loss_mlp": 1.00033128, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 5.0861360567684235, + "language_loss": 0.79788929, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.82011831, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 2.584618330001831 + }, + { + "auxiliary_loss_clip": 0.01131645, + "auxiliary_loss_mlp": 0.01103341, + "balance_loss_clip": 1.00173116, + "balance_loss_mlp": 1.00043941, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.6068066048689404, + "language_loss": 0.62874842, + "learning_rate": 2.224372736588449e-07, + "loss": 0.65109825, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 4.0070481300354 + }, + { + "auxiliary_loss_clip": 0.01098143, + "auxiliary_loss_mlp": 0.01104288, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.0005281, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.5864051001285815, + "language_loss": 0.764296, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.78632033, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 2.721985340118408 + }, + { + "auxiliary_loss_clip": 0.01148594, + "auxiliary_loss_mlp": 0.01103218, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.00050771, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.4519411671863944, + "language_loss": 0.78170025, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80421841, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.5764520168304443 + }, + { + "auxiliary_loss_clip": 0.01133992, + "auxiliary_loss_mlp": 0.01103331, + "balance_loss_clip": 1.00180137, + "balance_loss_mlp": 1.00042987, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 1.9455930390275247, + "language_loss": 0.79434699, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81672025, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.5650484561920166 + }, + { + "auxiliary_loss_clip": 0.01098375, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_clip": 1.00159287, + "balance_loss_mlp": 1.00047064, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 1.823950898699457, + "language_loss": 0.7636342, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78564405, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.6444549560546875 + }, + { + "auxiliary_loss_clip": 0.01148393, + "auxiliary_loss_mlp": 0.01103278, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.00047159, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.8308126299029883, + "language_loss": 0.68885124, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71136796, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.5331900119781494 + }, + { + "auxiliary_loss_clip": 0.01148581, + "auxiliary_loss_mlp": 0.01105248, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00062978, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.0773844567778452, + "language_loss": 0.62885052, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.65138882, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 2.5526468753814697 + }, + { + "auxiliary_loss_clip": 0.01130994, + "auxiliary_loss_mlp": 0.01103502, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00060081, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 1.833498757221182, + "language_loss": 0.76291955, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78526449, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 2.574324369430542 + }, + { + "auxiliary_loss_clip": 0.01165088, + "auxiliary_loss_mlp": 0.01103294, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.00048804, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 1.7140007903533705, + "language_loss": 0.69314313, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71582699, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.498028516769409 + }, + { + "auxiliary_loss_clip": 0.01131771, + "auxiliary_loss_mlp": 0.01102913, + "balance_loss_clip": 1.00174308, + "balance_loss_mlp": 1.00039291, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 2.008776581401889, + "language_loss": 0.85899138, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88133824, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.5743205547332764 + }, + { + "auxiliary_loss_clip": 0.01128925, + "auxiliary_loss_mlp": 0.01080126, + "balance_loss_clip": 1.0011549, + "balance_loss_mlp": 1.00001705, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7573249662549488, + "language_loss": 0.55089885, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57298934, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.1066718101501465 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.00747333, + "balance_loss_clip": 1.00170732, + "balance_loss_mlp": 1.00033832, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.5017889530601192, + "language_loss": 0.81459075, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83322304, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 2.646793842315674 + }, + { + "auxiliary_loss_clip": 0.01164975, + "auxiliary_loss_mlp": 0.01102942, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00051785, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.354076410472982, + "language_loss": 0.68390322, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70658243, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 2.7437586784362793 + }, + { + "auxiliary_loss_clip": 0.01116261, + "auxiliary_loss_mlp": 0.01101276, + "balance_loss_clip": 1.0016371, + "balance_loss_mlp": 1.00047255, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.756498672855823, + "language_loss": 0.86342973, + "learning_rate": 2.201224390669072e-07, + "loss": 0.8856051, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 4.2204365730285645 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.01103103, + "balance_loss_clip": 1.00164878, + "balance_loss_mlp": 1.00048721, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.6205621981493932, + "language_loss": 0.77644992, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79865354, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 4.125883340835571 + }, + { + "auxiliary_loss_clip": 0.01131575, + "auxiliary_loss_mlp": 0.01102601, + "balance_loss_clip": 1.00155401, + "balance_loss_mlp": 1.00046217, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 1.6387466668031867, + "language_loss": 0.69117695, + "learning_rate": 2.19767322694256e-07, + "loss": 0.71351874, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.555269718170166 + }, + { + "auxiliary_loss_clip": 0.01148133, + "auxiliary_loss_mlp": 0.01102897, + "balance_loss_clip": 1.00172961, + "balance_loss_mlp": 1.00056791, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 2.367311253108812, + "language_loss": 0.80429167, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82680196, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 3.925307035446167 + }, + { + "auxiliary_loss_clip": 0.01135509, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_clip": 1.00185144, + "balance_loss_mlp": 1.0005703, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 2.283278664905455, + "language_loss": 0.66331375, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.68570924, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 2.6218273639678955 + }, + { + "auxiliary_loss_clip": 0.01165232, + "auxiliary_loss_mlp": 0.01103654, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00056219, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 1.9994980271188763, + "language_loss": 0.59327894, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.61596781, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.5198168754577637 + }, + { + "auxiliary_loss_clip": 0.01131722, + "auxiliary_loss_mlp": 0.01103911, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.0004369, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.217007588076002, + "language_loss": 0.71932191, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.7416783, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 2.6666243076324463 + }, + { + "auxiliary_loss_clip": 0.01148786, + "auxiliary_loss_mlp": 0.01102678, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.0004437, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 2.647919404451649, + "language_loss": 0.76266873, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78518331, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 2.5021116733551025 + }, + { + "auxiliary_loss_clip": 0.01165145, + "auxiliary_loss_mlp": 0.01103647, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00055468, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.7348573130864504, + "language_loss": 0.85025585, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87294376, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.503033399581909 + }, + { + "auxiliary_loss_clip": 0.01134738, + "auxiliary_loss_mlp": 0.01102867, + "balance_loss_clip": 1.0017817, + "balance_loss_mlp": 1.00044274, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.3401514978464273, + "language_loss": 0.65877235, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68114841, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.6012070178985596 + }, + { + "auxiliary_loss_clip": 0.01102286, + "auxiliary_loss_mlp": 0.01102598, + "balance_loss_clip": 1.00173688, + "balance_loss_mlp": 1.00045967, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 10.730786312456344, + "language_loss": 0.70501387, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.7270627, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.6893084049224854 + }, + { + "auxiliary_loss_clip": 0.0113367, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00048661, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.3829918490846191, + "language_loss": 0.69995105, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72231877, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.616609811782837 + }, + { + "auxiliary_loss_clip": 0.01133548, + "auxiliary_loss_mlp": 0.01103591, + "balance_loss_clip": 1.00171709, + "balance_loss_mlp": 1.00059426, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 1.9810117704622618, + "language_loss": 0.81500244, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83737391, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 2.555380344390869 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.01103337, + "balance_loss_clip": 1.00163782, + "balance_loss_mlp": 1.00043559, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 1.9484112081231975, + "language_loss": 0.6619857, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68420231, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.763782501220703 + }, + { + "auxiliary_loss_clip": 0.01164964, + "auxiliary_loss_mlp": 0.01102651, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00041687, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.693916381484986, + "language_loss": 0.78211391, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80479002, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 2.4866809844970703 + }, + { + "auxiliary_loss_clip": 0.01133216, + "auxiliary_loss_mlp": 0.01103891, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00041723, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 3.630294437792698, + "language_loss": 0.66434908, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68672007, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 2.6083028316497803 + }, + { + "auxiliary_loss_clip": 0.01165001, + "auxiliary_loss_mlp": 0.01102907, + "balance_loss_clip": 1.00186884, + "balance_loss_mlp": 1.00048292, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.7248931166464812, + "language_loss": 0.62165385, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64433295, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 2.6118478775024414 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01102745, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00041628, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.535193555907106, + "language_loss": 0.65366924, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67586744, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.601395606994629 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01103215, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00050402, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.4291484941937107, + "language_loss": 0.65037996, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67289424, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 2.5643911361694336 + }, + { + "auxiliary_loss_clip": 0.01150115, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.00182641, + "balance_loss_mlp": 1.00053859, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 2.3857580798070455, + "language_loss": 0.70315886, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72569156, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.5300214290618896 + }, + { + "auxiliary_loss_clip": 0.01133679, + "auxiliary_loss_mlp": 0.01104364, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00060463, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 5.319419854509031, + "language_loss": 0.67440963, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69679004, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.549313545227051 + }, + { + "auxiliary_loss_clip": 0.01164947, + "auxiliary_loss_mlp": 0.01102479, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00043547, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 2.151782589331243, + "language_loss": 0.71153837, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73421258, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.507420301437378 + }, + { + "auxiliary_loss_clip": 0.01117277, + "auxiliary_loss_mlp": 0.01103261, + "balance_loss_clip": 1.00172055, + "balance_loss_mlp": 1.00074148, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.6714894750190523, + "language_loss": 0.59932178, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62152719, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 2.6650514602661133 + }, + { + "auxiliary_loss_clip": 0.01135206, + "auxiliary_loss_mlp": 0.0110198, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.00041318, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.5225185632724791, + "language_loss": 0.84186244, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86423427, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.6209778785705566 + }, + { + "auxiliary_loss_clip": 0.01148361, + "auxiliary_loss_mlp": 0.01102878, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00064456, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.5447716787220112, + "language_loss": 0.74057949, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76309186, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 2.544191598892212 + }, + { + "auxiliary_loss_clip": 0.01133494, + "auxiliary_loss_mlp": 0.00747301, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00035381, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 2.1345334515946712, + "language_loss": 0.75404829, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77285624, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 4.003441095352173 + }, + { + "auxiliary_loss_clip": 0.01086779, + "auxiliary_loss_mlp": 0.0110253, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00058186, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.5451396293770945, + "language_loss": 0.76924908, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79114211, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.7393057346343994 + }, + { + "auxiliary_loss_clip": 0.01165138, + "auxiliary_loss_mlp": 0.01104153, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00048828, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 1.7948503690349944, + "language_loss": 0.54232752, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56502044, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 2.479464054107666 + }, + { + "auxiliary_loss_clip": 0.01134107, + "auxiliary_loss_mlp": 0.0074738, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00028789, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 1.9283606717124244, + "language_loss": 0.65399206, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67280692, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 2.572601556777954 + }, + { + "auxiliary_loss_clip": 0.01115518, + "auxiliary_loss_mlp": 0.01103653, + "balance_loss_clip": 1.00165129, + "balance_loss_mlp": 1.00046587, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 1.8667504295149915, + "language_loss": 0.74036431, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76255608, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 2.5941359996795654 + }, + { + "auxiliary_loss_clip": 0.01148291, + "auxiliary_loss_mlp": 0.01103139, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00033319, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 1.6943102769374239, + "language_loss": 0.72788489, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.75039911, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.5472891330718994 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.01103804, + "balance_loss_clip": 1.00188315, + "balance_loss_mlp": 1.00042582, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 2.211829610879548, + "language_loss": 0.82566339, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84820497, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 2.5247390270233154 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01103708, + "balance_loss_clip": 1.00195158, + "balance_loss_mlp": 1.00052071, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 2.1651126250592996, + "language_loss": 0.67763215, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.70016921, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.558832883834839 + }, + { + "auxiliary_loss_clip": 0.01131882, + "auxiliary_loss_mlp": 0.01103618, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00043023, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.3363385892838235, + "language_loss": 0.67148471, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69383967, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.6066932678222656 + }, + { + "auxiliary_loss_clip": 0.01148643, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00048673, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.6738505017794358, + "language_loss": 0.76520205, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78771949, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 2.5434176921844482 + }, + { + "auxiliary_loss_clip": 0.01115178, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_clip": 1.00123286, + "balance_loss_mlp": 0.99999982, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7684748325174806, + "language_loss": 0.57964224, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60158747, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.194359540939331 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01079472, + "balance_loss_clip": 1.00112152, + "balance_loss_mlp": 1.00012612, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7773863428592814, + "language_loss": 0.56702125, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58910704, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 3.030008554458618 + }, + { + "auxiliary_loss_clip": 0.01131694, + "auxiliary_loss_mlp": 0.01103213, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00050199, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.6038672518001629, + "language_loss": 0.70052344, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.7228725, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.572652816772461 + }, + { + "auxiliary_loss_clip": 0.01133532, + "auxiliary_loss_mlp": 0.01101423, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.00042915, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.5491719689155286, + "language_loss": 0.63661921, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65896881, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 4.043558120727539 + }, + { + "auxiliary_loss_clip": 0.01164862, + "auxiliary_loss_mlp": 0.01101487, + "balance_loss_clip": 1.00180614, + "balance_loss_mlp": 1.00049329, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.4611446880337973, + "language_loss": 0.69420153, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.716865, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 2.5026087760925293 + }, + { + "auxiliary_loss_clip": 0.0116514, + "auxiliary_loss_mlp": 0.01103761, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00047791, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 1.9429706996144873, + "language_loss": 0.66443753, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68712652, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 3.9771547317504883 + }, + { + "auxiliary_loss_clip": 0.01117389, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_clip": 1.00161219, + "balance_loss_mlp": 1.00042629, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 2.994636918120991, + "language_loss": 0.61862957, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64083958, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 4.133288621902466 + }, + { + "auxiliary_loss_clip": 0.01165215, + "auxiliary_loss_mlp": 0.01104349, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.0006851, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.4466464315323821, + "language_loss": 0.74525297, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76794863, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.571096181869507 + }, + { + "auxiliary_loss_clip": 0.01069857, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_clip": 1.00141084, + "balance_loss_mlp": 1.0006628, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 2.0791954636148535, + "language_loss": 0.76288724, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78463191, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.8029675483703613 + }, + { + "auxiliary_loss_clip": 0.01131921, + "auxiliary_loss_mlp": 0.00747274, + "balance_loss_clip": 1.00172961, + "balance_loss_mlp": 1.00030017, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.697735984452227, + "language_loss": 0.68235022, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70114219, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 2.7409892082214355 + }, + { + "auxiliary_loss_clip": 0.01145878, + "auxiliary_loss_mlp": 0.01079807, + "balance_loss_clip": 1.00117517, + "balance_loss_mlp": 1.00007999, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7612160175547514, + "language_loss": 0.5848192, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60707605, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 3.0597259998321533 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.0110497, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00044727, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.8842631527918807, + "language_loss": 0.77642334, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79895675, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.570894718170166 + }, + { + "auxiliary_loss_clip": 0.01135429, + "auxiliary_loss_mlp": 0.01103391, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.00039458, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 1.8505328207865905, + "language_loss": 0.81302732, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83541554, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.578994035720825 + }, + { + "auxiliary_loss_clip": 0.01131674, + "auxiliary_loss_mlp": 0.01102014, + "balance_loss_clip": 1.0017128, + "balance_loss_mlp": 1.00039995, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 1.7075592997869804, + "language_loss": 0.77171975, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79405665, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.5585482120513916 + }, + { + "auxiliary_loss_clip": 0.01101971, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_clip": 1.00161517, + "balance_loss_mlp": 1.00041401, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.759439792780842, + "language_loss": 0.77667189, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.79872, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.6786892414093018 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01102784, + "balance_loss_clip": 1.00187242, + "balance_loss_mlp": 1.00045514, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.7568315582413472, + "language_loss": 0.78480744, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80717432, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.5997095108032227 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01102911, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00048614, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.6501097791045105, + "language_loss": 0.79691726, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81928122, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.5624821186065674 + }, + { + "auxiliary_loss_clip": 0.01115117, + "auxiliary_loss_mlp": 0.01102139, + "balance_loss_clip": 1.00151372, + "balance_loss_mlp": 1.00047696, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.8804886093239415, + "language_loss": 0.61597461, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63814712, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 2.631028652191162 + }, + { + "auxiliary_loss_clip": 0.01133189, + "auxiliary_loss_mlp": 0.01104079, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00051045, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.6124178746357931, + "language_loss": 0.69570744, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71808016, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 2.6835103034973145 + }, + { + "auxiliary_loss_clip": 0.01128546, + "auxiliary_loss_mlp": 0.01079782, + "balance_loss_clip": 1.00111842, + "balance_loss_mlp": 1.00005436, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.8007813841887687, + "language_loss": 0.59205937, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61414266, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.228217124938965 + }, + { + "auxiliary_loss_clip": 0.01131388, + "auxiliary_loss_mlp": 0.0110229, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.00034201, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.6636440788855427, + "language_loss": 0.81173342, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83407021, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.620326280593872 + }, + { + "auxiliary_loss_clip": 0.01164803, + "auxiliary_loss_mlp": 0.01102046, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00038421, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.1761527356416845, + "language_loss": 0.67558885, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69825733, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 2.515594482421875 + }, + { + "auxiliary_loss_clip": 0.01148455, + "auxiliary_loss_mlp": 0.01103591, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00040388, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.5268083112739848, + "language_loss": 0.69956386, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72208434, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 2.532975196838379 + }, + { + "auxiliary_loss_clip": 0.01118427, + "auxiliary_loss_mlp": 0.0110305, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 1.00043488, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 45.34225644612489, + "language_loss": 0.76905107, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.7912659, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.712832450866699 + }, + { + "auxiliary_loss_clip": 0.01149912, + "auxiliary_loss_mlp": 0.00747153, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00040603, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.6768917095316977, + "language_loss": 0.67824781, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69721842, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.5613036155700684 + }, + { + "auxiliary_loss_clip": 0.0115045, + "auxiliary_loss_mlp": 0.01103213, + "balance_loss_clip": 1.00183904, + "balance_loss_mlp": 1.00059795, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.583155333105829, + "language_loss": 0.77104801, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79358464, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.560682773590088 + }, + { + "auxiliary_loss_clip": 0.01133144, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00042415, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.7401356882130068, + "language_loss": 0.74320495, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76557344, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 2.595287322998047 + }, + { + "auxiliary_loss_clip": 0.0114575, + "auxiliary_loss_mlp": 0.01103327, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.0004735, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.699005879414736, + "language_loss": 0.78936064, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81185144, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 4.010161399841309 + }, + { + "auxiliary_loss_clip": 0.01116982, + "auxiliary_loss_mlp": 0.0110236, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00041234, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.546290927398586, + "language_loss": 0.68215513, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70434856, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 2.63466477394104 + }, + { + "auxiliary_loss_clip": 0.01115094, + "auxiliary_loss_mlp": 0.00747341, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00036442, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.5528931751453405, + "language_loss": 0.7968809, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81550527, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 2.6406471729278564 + }, + { + "auxiliary_loss_clip": 0.01118728, + "auxiliary_loss_mlp": 0.01103737, + "balance_loss_clip": 1.001719, + "balance_loss_mlp": 1.00045371, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.7703895471808884, + "language_loss": 0.69665885, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71888345, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 2.7341904640197754 + }, + { + "auxiliary_loss_clip": 0.01165059, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00032723, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.649636457653899, + "language_loss": 0.66165525, + "learning_rate": 2.085464646918027e-07, + "loss": 0.6843257, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.518580198287964 + }, + { + "auxiliary_loss_clip": 0.01132728, + "auxiliary_loss_mlp": 0.01103114, + "balance_loss_clip": 1.00178206, + "balance_loss_mlp": 1.00049853, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.5692229164893048, + "language_loss": 0.75197524, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77433366, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 2.6282246112823486 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01102503, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00046015, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.6843345735042963, + "language_loss": 0.88140428, + "learning_rate": 2.082002873852946e-07, + "loss": 0.90391266, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 2.5163426399230957 + }, + { + "auxiliary_loss_clip": 0.01148455, + "auxiliary_loss_mlp": 0.01103934, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00055599, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.8916703217616726, + "language_loss": 0.72906446, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75158834, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 2.5412938594818115 + }, + { + "auxiliary_loss_clip": 0.01150481, + "auxiliary_loss_mlp": 0.01102591, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00035739, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.5891891083725287, + "language_loss": 0.66527188, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68780261, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.69225811958313 + }, + { + "auxiliary_loss_clip": 0.01135717, + "auxiliary_loss_mlp": 0.01102433, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.00039017, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.6199961790001298, + "language_loss": 0.73833954, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76072109, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.5751101970672607 + }, + { + "auxiliary_loss_clip": 0.01095816, + "auxiliary_loss_mlp": 0.0074654, + "balance_loss_clip": 1.00115085, + "balance_loss_mlp": 1.00109792, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7976310180857968, + "language_loss": 0.59500408, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.6134277, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.294609546661377 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01104124, + "balance_loss_clip": 1.00182903, + "balance_loss_mlp": 1.00045919, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.952203623173339, + "language_loss": 0.75691116, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77928782, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 2.5562002658843994 + }, + { + "auxiliary_loss_clip": 0.01148371, + "auxiliary_loss_mlp": 0.01102715, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00048089, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.6816593575088286, + "language_loss": 0.82128972, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.8438006, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.525515556335449 + }, + { + "auxiliary_loss_clip": 0.01143919, + "auxiliary_loss_mlp": 0.01079735, + "balance_loss_clip": 1.00106955, + "balance_loss_mlp": 1.00000763, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.9446458014035882, + "language_loss": 0.60849565, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.63073218, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.164377212524414 + }, + { + "auxiliary_loss_clip": 0.01148687, + "auxiliary_loss_mlp": 0.01104084, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00041962, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.1373584828870227, + "language_loss": 0.59389454, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.6164223, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.553215265274048 + }, + { + "auxiliary_loss_clip": 0.01131552, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00054073, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.925429141563869, + "language_loss": 0.76358652, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78593075, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 4.143714666366577 + }, + { + "auxiliary_loss_clip": 0.01131679, + "auxiliary_loss_mlp": 0.01103736, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.0005486, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.528993274107593, + "language_loss": 0.83529323, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85764742, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 5.473419666290283 + }, + { + "auxiliary_loss_clip": 0.01131636, + "auxiliary_loss_mlp": 0.01104839, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00060201, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 2.2165397577253434, + "language_loss": 0.74468392, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76704866, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 2.5607571601867676 + }, + { + "auxiliary_loss_clip": 0.01165008, + "auxiliary_loss_mlp": 0.01102515, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00056708, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.8319803696339276, + "language_loss": 0.66041285, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68308806, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.5134973526000977 + }, + { + "auxiliary_loss_clip": 0.01148286, + "auxiliary_loss_mlp": 0.01102864, + "balance_loss_clip": 1.00185382, + "balance_loss_mlp": 1.00053477, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.753107573846469, + "language_loss": 0.62401044, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64652193, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.5225648880004883 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.00747468, + "balance_loss_clip": 1.00150919, + "balance_loss_mlp": 1.00042069, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.7522507538775105, + "language_loss": 0.73184347, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75065553, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 2.537022590637207 + }, + { + "auxiliary_loss_clip": 0.0113351, + "auxiliary_loss_mlp": 0.01102412, + "balance_loss_clip": 1.00173116, + "balance_loss_mlp": 1.00046396, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 2.1119247833244894, + "language_loss": 0.74874675, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77110589, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 2.581319808959961 + }, + { + "auxiliary_loss_clip": 0.01150151, + "auxiliary_loss_mlp": 0.01103202, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00039601, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 2.1684196210668305, + "language_loss": 0.60133851, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62387204, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.63484525680542 + }, + { + "auxiliary_loss_clip": 0.01131798, + "auxiliary_loss_mlp": 0.01101698, + "balance_loss_clip": 1.00160742, + "balance_loss_mlp": 1.00051308, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.7566735819735597, + "language_loss": 0.75765383, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77998883, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 2.623499870300293 + }, + { + "auxiliary_loss_clip": 0.0114875, + "auxiliary_loss_mlp": 0.01103755, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.0006628, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 1.7571955330201914, + "language_loss": 0.74057257, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76309764, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.5346524715423584 + }, + { + "auxiliary_loss_clip": 0.01126665, + "auxiliary_loss_mlp": 0.0074644, + "balance_loss_clip": 1.00113702, + "balance_loss_mlp": 1.00108802, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 1.0327842019041993, + "language_loss": 0.49450725, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51323831, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.1582224369049072 + }, + { + "auxiliary_loss_clip": 0.01148316, + "auxiliary_loss_mlp": 0.01103113, + "balance_loss_clip": 1.00184429, + "balance_loss_mlp": 1.00059319, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.7926530674729022, + "language_loss": 0.79046685, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81298119, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.6060564517974854 + }, + { + "auxiliary_loss_clip": 0.01115955, + "auxiliary_loss_mlp": 0.01104525, + "balance_loss_clip": 1.00159311, + "balance_loss_mlp": 1.00057495, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.267262205007903, + "language_loss": 0.80709946, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82930434, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.632941484451294 + }, + { + "auxiliary_loss_clip": 0.01150358, + "auxiliary_loss_mlp": 0.01104207, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00063825, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 2.5860405891388853, + "language_loss": 0.64967328, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67221892, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.5250818729400635 + }, + { + "auxiliary_loss_clip": 0.01131724, + "auxiliary_loss_mlp": 0.01104104, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00053465, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.7795217943643533, + "language_loss": 0.55070132, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57305956, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 2.6730620861053467 + }, + { + "auxiliary_loss_clip": 0.01149977, + "auxiliary_loss_mlp": 0.0110316, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00054455, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 1.8461976335630812, + "language_loss": 0.714517, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73704839, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 2.514195680618286 + }, + { + "auxiliary_loss_clip": 0.01149368, + "auxiliary_loss_mlp": 0.01102797, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.00046825, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.419539715986647, + "language_loss": 0.71155488, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73407656, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.580291271209717 + }, + { + "auxiliary_loss_clip": 0.01131741, + "auxiliary_loss_mlp": 0.01102652, + "balance_loss_clip": 1.00177562, + "balance_loss_mlp": 1.0006088, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.5925975110793924, + "language_loss": 0.68742067, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70976454, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 2.561774253845215 + }, + { + "auxiliary_loss_clip": 0.01164935, + "auxiliary_loss_mlp": 0.01102595, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00055158, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 1.906794611906689, + "language_loss": 0.77550042, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.79817569, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.494910717010498 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01104013, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00063443, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 4.357561398568165, + "language_loss": 0.6901173, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71249747, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 2.5563716888427734 + }, + { + "auxiliary_loss_clip": 0.0113384, + "auxiliary_loss_mlp": 0.01103567, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00056982, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 2.750501247528725, + "language_loss": 0.79308414, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81545818, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.6001453399658203 + }, + { + "auxiliary_loss_clip": 0.0115045, + "auxiliary_loss_mlp": 0.01102547, + "balance_loss_clip": 1.00186408, + "balance_loss_mlp": 1.00050366, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.4936143656767082, + "language_loss": 0.68175995, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70428991, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.595647096633911 + }, + { + "auxiliary_loss_clip": 0.01133652, + "auxiliary_loss_mlp": 0.01102562, + "balance_loss_clip": 1.00187671, + "balance_loss_mlp": 1.00061393, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 2.0868736162298265, + "language_loss": 0.68789488, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.71025705, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.536559581756592 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01103054, + "balance_loss_clip": 1.00162995, + "balance_loss_mlp": 1.00062907, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.150389735235035, + "language_loss": 0.71671522, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73889536, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 2.701773166656494 + }, + { + "auxiliary_loss_clip": 0.01134802, + "auxiliary_loss_mlp": 0.01103057, + "balance_loss_clip": 1.00165558, + "balance_loss_mlp": 1.0005374, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.9378908381700128, + "language_loss": 0.68997896, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71235752, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 4.034108877182007 + }, + { + "auxiliary_loss_clip": 0.01101892, + "auxiliary_loss_mlp": 0.01103372, + "balance_loss_clip": 1.0016439, + "balance_loss_mlp": 1.00056589, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 1.639331264770318, + "language_loss": 0.74472702, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76677966, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.661494255065918 + }, + { + "auxiliary_loss_clip": 0.01148147, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 1.00053561, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.6914453040507327, + "language_loss": 0.83843398, + "learning_rate": 2.02186225623733e-07, + "loss": 0.8609345, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 2.5635435581207275 + }, + { + "auxiliary_loss_clip": 0.0115032, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00066566, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.444494642019708, + "language_loss": 0.77316916, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.7957052, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 2.4949541091918945 + }, + { + "auxiliary_loss_clip": 0.01165085, + "auxiliary_loss_mlp": 0.01104176, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00051177, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 1.9901464588979993, + "language_loss": 0.53853095, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56122351, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.4570083618164062 + }, + { + "auxiliary_loss_clip": 0.01165122, + "auxiliary_loss_mlp": 0.01102881, + "balance_loss_clip": 1.00196159, + "balance_loss_mlp": 1.00036097, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 1.949865678065557, + "language_loss": 0.84062946, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86330956, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 2.4642300605773926 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.00747359, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00033569, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.2700750821518287, + "language_loss": 0.71533704, + "learning_rate": 2.01504216561474e-07, + "loss": 0.7343123, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.6397273540496826 + }, + { + "auxiliary_loss_clip": 0.01150434, + "auxiliary_loss_mlp": 0.00747614, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00049305, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 1.7223415460956464, + "language_loss": 0.63734901, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65632951, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.5806243419647217 + }, + { + "auxiliary_loss_clip": 0.01129169, + "auxiliary_loss_mlp": 0.01079723, + "balance_loss_clip": 1.00111389, + "balance_loss_mlp": 0.99999589, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.618497873700733, + "language_loss": 0.48427933, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50636828, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 3.245295286178589 + }, + { + "auxiliary_loss_clip": 0.01083357, + "auxiliary_loss_mlp": 0.01104223, + "balance_loss_clip": 1.00159717, + "balance_loss_mlp": 1.00065351, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.7813139655638637, + "language_loss": 0.67241138, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69428724, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 2.6972646713256836 + }, + { + "auxiliary_loss_clip": 0.01087679, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.0015099, + "balance_loss_mlp": 1.00055242, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.785999131027393, + "language_loss": 0.78625298, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80815196, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 2.6903178691864014 + }, + { + "auxiliary_loss_clip": 0.0114819, + "auxiliary_loss_mlp": 0.01103192, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00057697, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 1.8284964627807614, + "language_loss": 0.71832019, + "learning_rate": 2.006532397626639e-07, + "loss": 0.74083406, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 2.523715019226074 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01103244, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.0005331, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 2.3835423660099204, + "language_loss": 0.78162837, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80399811, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.5663795471191406 + }, + { + "auxiliary_loss_clip": 0.01135508, + "auxiliary_loss_mlp": 0.01103278, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00056684, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 3.8213735905005883, + "language_loss": 0.72864676, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75103462, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.67331600189209 + }, + { + "auxiliary_loss_clip": 0.01131378, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 1.00048089, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.827261479480109, + "language_loss": 0.68974674, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71208578, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 5.44618821144104 + }, + { + "auxiliary_loss_clip": 0.01148408, + "auxiliary_loss_mlp": 0.01102869, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00063539, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.710767696356993, + "language_loss": 0.72132921, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74384201, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 3.984093427658081 + }, + { + "auxiliary_loss_clip": 0.01131978, + "auxiliary_loss_mlp": 0.01103784, + "balance_loss_clip": 1.00194013, + "balance_loss_mlp": 1.00050116, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 2.2443182268381032, + "language_loss": 0.82865894, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.85101652, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.5859477519989014 + }, + { + "auxiliary_loss_clip": 0.01135112, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00059092, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.8206225089035688, + "language_loss": 0.67186773, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69424903, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 2.8422605991363525 + }, + { + "auxiliary_loss_clip": 0.01148383, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_clip": 1.00178814, + "balance_loss_mlp": 1.0005455, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.5118047593533195, + "language_loss": 0.71590304, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73841459, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.718235731124878 + }, + { + "auxiliary_loss_clip": 0.01133948, + "auxiliary_loss_mlp": 0.00747319, + "balance_loss_clip": 1.00189745, + "balance_loss_mlp": 1.00035715, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.9153929711323925, + "language_loss": 0.67088437, + "learning_rate": 1.992952252525839e-07, + "loss": 0.68969703, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 2.614276647567749 + }, + { + "auxiliary_loss_clip": 0.01133667, + "auxiliary_loss_mlp": 0.01103315, + "balance_loss_clip": 1.00159383, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 2.2304985651544587, + "language_loss": 0.80021781, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82258761, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 2.5728535652160645 + }, + { + "auxiliary_loss_clip": 0.01150133, + "auxiliary_loss_mlp": 0.00747422, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00041175, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 2.030071042330601, + "language_loss": 0.70878553, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.72776103, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.522400140762329 + }, + { + "auxiliary_loss_clip": 0.01133229, + "auxiliary_loss_mlp": 0.01103948, + "balance_loss_clip": 1.00176442, + "balance_loss_mlp": 1.0005697, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 1.912549287422251, + "language_loss": 0.56099546, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58336723, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 2.565096855163574 + }, + { + "auxiliary_loss_clip": 0.01116331, + "auxiliary_loss_mlp": 0.01102231, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00037813, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 3.635103350331375, + "language_loss": 0.75585222, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77803785, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 2.6229653358459473 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.01102657, + "balance_loss_clip": 1.00162733, + "balance_loss_mlp": 1.00042343, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 1.9793293261744835, + "language_loss": 0.66511393, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68715912, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.625150680541992 + }, + { + "auxiliary_loss_clip": 0.01148346, + "auxiliary_loss_mlp": 0.01103662, + "balance_loss_clip": 1.0017345, + "balance_loss_mlp": 1.0005697, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.5305902230351716, + "language_loss": 0.64477074, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66729081, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.541219711303711 + }, + { + "auxiliary_loss_clip": 0.01135687, + "auxiliary_loss_mlp": 0.01103584, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00058687, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 2.0992117913216966, + "language_loss": 0.84840035, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.8707931, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.561920642852783 + }, + { + "auxiliary_loss_clip": 0.01150294, + "auxiliary_loss_mlp": 0.01102266, + "balance_loss_clip": 1.00179207, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 1.9451905706617771, + "language_loss": 0.75111938, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77364492, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.559047222137451 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.01102119, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.0003624, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.659905438357368, + "language_loss": 0.80098701, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82351172, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 2.584398031234741 + }, + { + "auxiliary_loss_clip": 0.01131877, + "auxiliary_loss_mlp": 0.01103104, + "balance_loss_clip": 1.00178564, + "balance_loss_mlp": 1.0005846, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.048382711763972, + "language_loss": 0.7711277, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79347754, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 2.617777109146118 + }, + { + "auxiliary_loss_clip": 0.01148226, + "auxiliary_loss_mlp": 0.01102706, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00056732, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 2.0387275317636995, + "language_loss": 0.65093565, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67344499, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.606126308441162 + }, + { + "auxiliary_loss_clip": 0.0113184, + "auxiliary_loss_mlp": 0.0110202, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00054908, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.6274383879123644, + "language_loss": 0.76080453, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.7831431, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 2.5806899070739746 + }, + { + "auxiliary_loss_clip": 0.01148608, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_clip": 1.00164998, + "balance_loss_mlp": 1.00055027, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 1.7680048947914457, + "language_loss": 0.66988266, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.6924004, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.5517659187316895 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01105513, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00051355, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.9002922672210822, + "language_loss": 0.61903346, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64142847, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.6964025497436523 + }, + { + "auxiliary_loss_clip": 0.01119943, + "auxiliary_loss_mlp": 0.01104975, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00064313, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 2.7965990785733954, + "language_loss": 0.6898278, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71207702, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.6108951568603516 + }, + { + "auxiliary_loss_clip": 0.01148373, + "auxiliary_loss_mlp": 0.01103514, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00051737, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 2.603921671758319, + "language_loss": 0.83068848, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85320735, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 2.5425570011138916 + }, + { + "auxiliary_loss_clip": 0.01165206, + "auxiliary_loss_mlp": 0.01103764, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00048089, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.5913847754211958, + "language_loss": 0.67481589, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69750559, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.505141496658325 + }, + { + "auxiliary_loss_clip": 0.01120619, + "auxiliary_loss_mlp": 0.01102712, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.00057364, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.7522152640715476, + "language_loss": 0.67175961, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69399285, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.744847536087036 + }, + { + "auxiliary_loss_clip": 0.01131166, + "auxiliary_loss_mlp": 0.0110315, + "balance_loss_clip": 1.00176406, + "balance_loss_mlp": 1.00063014, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 1.7298310055196349, + "language_loss": 0.62317151, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64551467, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 3.9961066246032715 + }, + { + "auxiliary_loss_clip": 0.01133401, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.00167286, + "balance_loss_mlp": 1.00047827, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.8106621657656305, + "language_loss": 0.62848401, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64729261, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 2.5439600944519043 + }, + { + "auxiliary_loss_clip": 0.01102551, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.0003593, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 1.5635071752374872, + "language_loss": 0.80362451, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82567024, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.662992238998413 + }, + { + "auxiliary_loss_clip": 0.01148313, + "auxiliary_loss_mlp": 0.0110231, + "balance_loss_clip": 1.00158596, + "balance_loss_mlp": 1.00055265, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 4.740618932002106, + "language_loss": 0.74714804, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76965427, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 2.58221435546875 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.0005101, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.9343998382631604, + "language_loss": 0.68316102, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70532584, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 2.600389003753662 + }, + { + "auxiliary_loss_clip": 0.01148282, + "auxiliary_loss_mlp": 0.01103429, + "balance_loss_clip": 1.00176299, + "balance_loss_mlp": 1.00062323, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.7019453118050238, + "language_loss": 0.67836916, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.70088637, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 2.5188095569610596 + }, + { + "auxiliary_loss_clip": 0.01114771, + "auxiliary_loss_mlp": 0.01102916, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.00049162, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.3730029375745616, + "language_loss": 0.81405032, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83622718, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.697112560272217 + }, + { + "auxiliary_loss_clip": 0.01149615, + "auxiliary_loss_mlp": 0.01103478, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00057697, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.6369390716115784, + "language_loss": 0.50635123, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52888215, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.6997416019439697 + }, + { + "auxiliary_loss_clip": 0.0107165, + "auxiliary_loss_mlp": 0.0110252, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.00047708, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.7248471597742434, + "language_loss": 0.7521463, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77388799, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.8373141288757324 + }, + { + "auxiliary_loss_clip": 0.01133966, + "auxiliary_loss_mlp": 0.01103691, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00040805, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 1.855861308582012, + "language_loss": 0.80388272, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82625932, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.6198580265045166 + }, + { + "auxiliary_loss_clip": 0.01148072, + "auxiliary_loss_mlp": 0.01102057, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00039542, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.7421923103147574, + "language_loss": 0.65806192, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68056321, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 2.674720525741577 + }, + { + "auxiliary_loss_clip": 0.0115011, + "auxiliary_loss_mlp": 0.01103129, + "balance_loss_clip": 1.00184548, + "balance_loss_mlp": 1.00060868, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.103426117685896, + "language_loss": 0.6992318, + "learning_rate": 1.942416188703573e-07, + "loss": 0.72176421, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.50187087059021 + }, + { + "auxiliary_loss_clip": 0.01133988, + "auxiliary_loss_mlp": 0.01103641, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00064385, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.7387413975498405, + "language_loss": 0.76741636, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.78979266, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.597445487976074 + }, + { + "auxiliary_loss_clip": 0.01148284, + "auxiliary_loss_mlp": 0.01102872, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.000543, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.8911899234480611, + "language_loss": 0.85018522, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87269676, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 2.576448917388916 + }, + { + "auxiliary_loss_clip": 0.01145585, + "auxiliary_loss_mlp": 0.01079699, + "balance_loss_clip": 1.00112963, + "balance_loss_mlp": 0.99997234, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.78606895169341, + "language_loss": 0.61914104, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.6413939, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 5.877326965332031 + }, + { + "auxiliary_loss_clip": 0.01164965, + "auxiliary_loss_mlp": 0.01102596, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00045729, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.6246353583987374, + "language_loss": 0.81755346, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84022909, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 3.911358594894409 + }, + { + "auxiliary_loss_clip": 0.01135117, + "auxiliary_loss_mlp": 0.01102701, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.00046754, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 1.955738313310322, + "language_loss": 0.85672987, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87910807, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.540893077850342 + }, + { + "auxiliary_loss_clip": 0.01114022, + "auxiliary_loss_mlp": 0.01103273, + "balance_loss_clip": 1.00185418, + "balance_loss_mlp": 1.00046706, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 2.336374343491112, + "language_loss": 0.58904004, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.61121297, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.59613037109375 + }, + { + "auxiliary_loss_clip": 0.01099785, + "auxiliary_loss_mlp": 0.01103137, + "balance_loss_clip": 1.00157344, + "balance_loss_mlp": 1.00052178, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 2.88716187369915, + "language_loss": 0.77141899, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79344815, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.6252853870391846 + }, + { + "auxiliary_loss_clip": 0.01148503, + "auxiliary_loss_mlp": 0.01104125, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00046027, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.0763560734816053, + "language_loss": 0.77554059, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79806685, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 2.541682243347168 + }, + { + "auxiliary_loss_clip": 0.01120847, + "auxiliary_loss_mlp": 0.01103847, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00046921, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.2848969825346563, + "language_loss": 0.75110757, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77335453, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 2.7143170833587646 + }, + { + "auxiliary_loss_clip": 0.01086521, + "auxiliary_loss_mlp": 0.01102679, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00044537, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.9447557298907578, + "language_loss": 0.70306402, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72495604, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.6814322471618652 + }, + { + "auxiliary_loss_clip": 0.01115449, + "auxiliary_loss_mlp": 0.01104072, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00059855, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 3.8728963833227237, + "language_loss": 0.76211357, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78430879, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.606844425201416 + }, + { + "auxiliary_loss_clip": 0.01160427, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_clip": 1.00114536, + "balance_loss_mlp": 0.99994892, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.957042701249557, + "language_loss": 0.58861715, + "learning_rate": 1.922374222645329e-07, + "loss": 0.61101818, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 3.092062473297119 + }, + { + "auxiliary_loss_clip": 0.01066929, + "auxiliary_loss_mlp": 0.01104618, + "balance_loss_clip": 1.00138688, + "balance_loss_mlp": 1.0004766, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.604725456173966, + "language_loss": 0.80684894, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82856441, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 2.8259503841400146 + }, + { + "auxiliary_loss_clip": 0.01133246, + "auxiliary_loss_mlp": 0.01103158, + "balance_loss_clip": 1.00165796, + "balance_loss_mlp": 1.00054288, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.9244555003156183, + "language_loss": 0.72381932, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.7461834, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 2.612675666809082 + }, + { + "auxiliary_loss_clip": 0.01134857, + "auxiliary_loss_mlp": 0.01103239, + "balance_loss_clip": 1.00171208, + "balance_loss_mlp": 1.00043261, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.535956185418084, + "language_loss": 0.71586621, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73824716, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.6400809288024902 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01104319, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.0006547, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.0001505323847324, + "language_loss": 0.7129842, + "learning_rate": 1.915715498065993e-07, + "loss": 0.7353617, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.579345703125 + }, + { + "auxiliary_loss_clip": 0.01133328, + "auxiliary_loss_mlp": 0.01103131, + "balance_loss_clip": 1.0017978, + "balance_loss_mlp": 1.00042033, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.7579226376219694, + "language_loss": 0.81789649, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.8402611, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.5833346843719482 + }, + { + "auxiliary_loss_clip": 0.01131919, + "auxiliary_loss_mlp": 0.01104407, + "balance_loss_clip": 1.00170326, + "balance_loss_mlp": 1.0005517, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.436538967536309, + "language_loss": 0.62131488, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.64367807, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 2.572385549545288 + }, + { + "auxiliary_loss_clip": 0.01148458, + "auxiliary_loss_mlp": 0.01102816, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00048697, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 2.233796235342954, + "language_loss": 0.76363528, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78614795, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 2.583444833755493 + }, + { + "auxiliary_loss_clip": 0.01134037, + "auxiliary_loss_mlp": 0.01104334, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.00047898, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 2.2924279543127306, + "language_loss": 0.64589185, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66827559, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.5926766395568848 + }, + { + "auxiliary_loss_clip": 0.01087296, + "auxiliary_loss_mlp": 0.01102444, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00049615, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 2.043819243605042, + "language_loss": 0.66023743, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68213487, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 2.7058446407318115 + }, + { + "auxiliary_loss_clip": 0.01129013, + "auxiliary_loss_mlp": 0.01079683, + "balance_loss_clip": 1.00119734, + "balance_loss_mlp": 0.99995565, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8596252090721204, + "language_loss": 0.56887728, + "learning_rate": 1.905747985193107e-07, + "loss": 0.59096426, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 3.0176241397857666 + }, + { + "auxiliary_loss_clip": 0.01165043, + "auxiliary_loss_mlp": 0.01102754, + "balance_loss_clip": 1.0020051, + "balance_loss_mlp": 1.0006156, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 2.5497023565087584, + "language_loss": 0.78857291, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81125087, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.556896924972534 + }, + { + "auxiliary_loss_clip": 0.01165082, + "auxiliary_loss_mlp": 0.01103015, + "balance_loss_clip": 1.00195587, + "balance_loss_mlp": 1.00049496, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.7656225605662654, + "language_loss": 0.63795424, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.66063523, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.4684293270111084 + }, + { + "auxiliary_loss_clip": 0.01134104, + "auxiliary_loss_mlp": 0.01102486, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00063324, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.7071846281276284, + "language_loss": 0.77478182, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79714775, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 2.5437679290771484 + }, + { + "auxiliary_loss_clip": 0.01099646, + "auxiliary_loss_mlp": 0.00747253, + "balance_loss_clip": 1.00163984, + "balance_loss_mlp": 1.00030661, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.408769682037422, + "language_loss": 0.60472298, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62319201, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 4.4990925788879395 + }, + { + "auxiliary_loss_clip": 0.01114336, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.0005312, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.511403248748698, + "language_loss": 0.66395211, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68612409, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 2.5999391078948975 + }, + { + "auxiliary_loss_clip": 0.01133481, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_clip": 1.00177336, + "balance_loss_mlp": 1.00062978, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.6812552837376467, + "language_loss": 0.70452017, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72688836, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 2.5722672939300537 + }, + { + "auxiliary_loss_clip": 0.01143889, + "auxiliary_loss_mlp": 0.01080117, + "balance_loss_clip": 1.00120687, + "balance_loss_mlp": 1.00000858, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8003020519384395, + "language_loss": 0.6029036, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62514377, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.127013683319092 + }, + { + "auxiliary_loss_clip": 0.01131246, + "auxiliary_loss_mlp": 0.0110315, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00063038, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.552441977952077, + "language_loss": 0.7421056, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.7644496, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 2.577810764312744 + }, + { + "auxiliary_loss_clip": 0.01136042, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00055265, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.1411388637085724, + "language_loss": 0.75359792, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77599955, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 2.5597641468048096 + }, + { + "auxiliary_loss_clip": 0.01131803, + "auxiliary_loss_mlp": 0.01102112, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00054574, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.3141028682753153, + "language_loss": 0.8450281, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86736721, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 2.537564754486084 + }, + { + "auxiliary_loss_clip": 0.01149525, + "auxiliary_loss_mlp": 0.01103093, + "balance_loss_clip": 1.0019027, + "balance_loss_mlp": 1.00047731, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 1.7081605483655988, + "language_loss": 0.75535214, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.7778784, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.54026460647583 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01102707, + "balance_loss_clip": 1.00188935, + "balance_loss_mlp": 1.00056887, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.7882285964915448, + "language_loss": 0.851542, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87390375, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 2.555990219116211 + }, + { + "auxiliary_loss_clip": 0.01147677, + "auxiliary_loss_mlp": 0.01102479, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00053072, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.6976114629793015, + "language_loss": 0.81171083, + "learning_rate": 1.884236463176072e-07, + "loss": 0.83421242, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.5216195583343506 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01104469, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00051856, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.101504354064586, + "language_loss": 0.72949463, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.7518757, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 2.631317138671875 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01104046, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.00057268, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 1.815583478375467, + "language_loss": 0.82049453, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84303606, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.5051522254943848 + }, + { + "auxiliary_loss_clip": 0.01164903, + "auxiliary_loss_mlp": 0.01102757, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00052273, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.9946798030568331, + "language_loss": 0.68606138, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70873797, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.487940549850464 + }, + { + "auxiliary_loss_clip": 0.01115422, + "auxiliary_loss_mlp": 0.01102417, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00065994, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.6056726076109429, + "language_loss": 0.9046526, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92683095, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 2.652766466140747 + }, + { + "auxiliary_loss_clip": 0.01099066, + "auxiliary_loss_mlp": 0.00747277, + "balance_loss_clip": 1.00163913, + "balance_loss_mlp": 1.00042844, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.4586613966060202, + "language_loss": 0.7090708, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72753423, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 4.096835136413574 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00058639, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.6434396832137868, + "language_loss": 0.8230474, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84573132, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 3.9079906940460205 + }, + { + "auxiliary_loss_clip": 0.01113867, + "auxiliary_loss_mlp": 0.01079739, + "balance_loss_clip": 1.00144386, + "balance_loss_mlp": 1.00001168, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.7887581928455104, + "language_loss": 0.68040073, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70233679, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.08158540725708 + }, + { + "auxiliary_loss_clip": 0.0115042, + "auxiliary_loss_mlp": 0.01104794, + "balance_loss_clip": 1.00180328, + "balance_loss_mlp": 1.0005579, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 1.8814331175207366, + "language_loss": 0.75590444, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77845657, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.540374994277954 + }, + { + "auxiliary_loss_clip": 0.01133724, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_clip": 1.0017314, + "balance_loss_mlp": 1.00047648, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 1.9430643193531054, + "language_loss": 0.73951328, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.76187861, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.568967819213867 + }, + { + "auxiliary_loss_clip": 0.01150409, + "auxiliary_loss_mlp": 0.01103251, + "balance_loss_clip": 1.00186098, + "balance_loss_mlp": 1.00044489, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 1.7423176845050954, + "language_loss": 0.64832163, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67085826, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 2.8353261947631836 + }, + { + "auxiliary_loss_clip": 0.01150557, + "auxiliary_loss_mlp": 0.01104324, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00046849, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.5986444838923342, + "language_loss": 0.67680031, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.69934916, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 2.5553171634674072 + }, + { + "auxiliary_loss_clip": 0.01148568, + "auxiliary_loss_mlp": 0.0110303, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00051022, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 1.9779720710765454, + "language_loss": 0.69587338, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71838939, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.558151960372925 + }, + { + "auxiliary_loss_clip": 0.01135137, + "auxiliary_loss_mlp": 0.01102784, + "balance_loss_clip": 1.00170565, + "balance_loss_mlp": 1.00045466, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 2.1006242937315505, + "language_loss": 0.6360516, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65843081, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 2.58130145072937 + }, + { + "auxiliary_loss_clip": 0.01118578, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00038373, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 1.8758065525070402, + "language_loss": 0.76206207, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78427696, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.6082005500793457 + }, + { + "auxiliary_loss_clip": 0.01148496, + "auxiliary_loss_mlp": 0.01102815, + "balance_loss_clip": 1.00180387, + "balance_loss_mlp": 1.00039065, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 2.1048964887674524, + "language_loss": 0.93126357, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95377672, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.491168975830078 + }, + { + "auxiliary_loss_clip": 0.01083446, + "auxiliary_loss_mlp": 0.0110305, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00062525, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.958865894214007, + "language_loss": 0.67318022, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69504517, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 2.749053955078125 + }, + { + "auxiliary_loss_clip": 0.01148202, + "auxiliary_loss_mlp": 0.01103292, + "balance_loss_clip": 1.00172853, + "balance_loss_mlp": 1.00048637, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 2.1000731987339463, + "language_loss": 0.73530781, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75782281, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.5086982250213623 + }, + { + "auxiliary_loss_clip": 0.01067549, + "auxiliary_loss_mlp": 0.0110199, + "balance_loss_clip": 1.00166464, + "balance_loss_mlp": 1.00042379, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.5736281534805647, + "language_loss": 0.74700159, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.76869696, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 2.72194766998291 + }, + { + "auxiliary_loss_clip": 0.0113389, + "auxiliary_loss_mlp": 0.01104505, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.0005542, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 3.2938809507841893, + "language_loss": 0.73261052, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75499445, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.6064770221710205 + }, + { + "auxiliary_loss_clip": 0.01116744, + "auxiliary_loss_mlp": 0.01102731, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00049663, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.9511803587158298, + "language_loss": 0.70422494, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72641969, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 2.6326022148132324 + }, + { + "auxiliary_loss_clip": 0.01148536, + "auxiliary_loss_mlp": 0.00747442, + "balance_loss_clip": 1.00182974, + "balance_loss_mlp": 1.00042272, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 2.6958573492987616, + "language_loss": 0.66366625, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68262601, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.5233898162841797 + }, + { + "auxiliary_loss_clip": 0.0113358, + "auxiliary_loss_mlp": 0.01103014, + "balance_loss_clip": 1.00170612, + "balance_loss_mlp": 1.00049448, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.5532411989452175, + "language_loss": 0.82752752, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.84989345, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 2.5833444595336914 + }, + { + "auxiliary_loss_clip": 0.01148228, + "auxiliary_loss_mlp": 0.01103174, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00065422, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.6198178598489392, + "language_loss": 0.70003015, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72254419, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.5477135181427 + }, + { + "auxiliary_loss_clip": 0.01148473, + "auxiliary_loss_mlp": 0.01103351, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00054502, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.7806129804226938, + "language_loss": 0.77371156, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79622984, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.52604603767395 + }, + { + "auxiliary_loss_clip": 0.0114845, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00054109, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.8227466224326017, + "language_loss": 0.77107269, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79359829, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 2.543351650238037 + }, + { + "auxiliary_loss_clip": 0.01117567, + "auxiliary_loss_mlp": 0.01103595, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00059819, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.8204206774995992, + "language_loss": 0.7756803, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79789197, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.589604377746582 + }, + { + "auxiliary_loss_clip": 0.01131254, + "auxiliary_loss_mlp": 0.01102556, + "balance_loss_clip": 1.00145555, + "balance_loss_mlp": 1.00060797, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 2.968221304056725, + "language_loss": 0.73818803, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.76052618, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 2.5257720947265625 + }, + { + "auxiliary_loss_clip": 0.01149967, + "auxiliary_loss_mlp": 0.00747233, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00043321, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.7737880497891712, + "language_loss": 0.69551659, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71448863, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.5438051223754883 + }, + { + "auxiliary_loss_clip": 0.01148289, + "auxiliary_loss_mlp": 0.01102662, + "balance_loss_clip": 1.00181603, + "balance_loss_mlp": 1.00052345, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.085460581737586, + "language_loss": 0.62864995, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65115941, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 3.9900810718536377 + }, + { + "auxiliary_loss_clip": 0.01116505, + "auxiliary_loss_mlp": 0.00747297, + "balance_loss_clip": 1.00169837, + "balance_loss_mlp": 1.00033879, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.6729811855471732, + "language_loss": 0.63647985, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65511787, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 2.6462454795837402 + }, + { + "auxiliary_loss_clip": 0.01111468, + "auxiliary_loss_mlp": 0.01079817, + "balance_loss_clip": 1.00122845, + "balance_loss_mlp": 1.00008988, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.799809059814642, + "language_loss": 0.60423511, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62614793, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 3.2400455474853516 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.00747419, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.0004003, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.8615907717954356, + "language_loss": 0.74367249, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76263416, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.528249502182007 + }, + { + "auxiliary_loss_clip": 0.01131734, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_clip": 1.00172639, + "balance_loss_mlp": 1.00059462, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.4695184486670756, + "language_loss": 0.74896395, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77131248, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.564987897872925 + }, + { + "auxiliary_loss_clip": 0.01148257, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00058818, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.4449191851280634, + "language_loss": 0.6816994, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.7042011, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 2.550351619720459 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_clip": 1.00167847, + "balance_loss_mlp": 1.00049365, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.8227996044263441, + "language_loss": 0.78993392, + "learning_rate": 1.826898250065465e-07, + "loss": 0.8124615, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 2.5055644512176514 + }, + { + "auxiliary_loss_clip": 0.01150329, + "auxiliary_loss_mlp": 0.0110251, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00056255, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5177245925839675, + "language_loss": 0.83392501, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85645342, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 2.520953893661499 + }, + { + "auxiliary_loss_clip": 0.01127933, + "auxiliary_loss_mlp": 0.01079321, + "balance_loss_clip": 1.00139475, + "balance_loss_mlp": 0.99997568, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7071381728621056, + "language_loss": 0.49075851, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51283103, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 3.165480375289917 + }, + { + "auxiliary_loss_clip": 0.01131883, + "auxiliary_loss_mlp": 0.00747238, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00040603, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.5227436804282108, + "language_loss": 0.73431301, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75310427, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 2.618324041366577 + }, + { + "auxiliary_loss_clip": 0.01118651, + "auxiliary_loss_mlp": 0.01101144, + "balance_loss_clip": 1.00176179, + "balance_loss_mlp": 1.00043643, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.5570043400152642, + "language_loss": 0.76726508, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.78946304, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 2.601259231567383 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.01102318, + "balance_loss_clip": 1.00170422, + "balance_loss_mlp": 1.00056088, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.8058762409372489, + "language_loss": 0.71479213, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73699939, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.6625750064849854 + }, + { + "auxiliary_loss_clip": 0.01148427, + "auxiliary_loss_mlp": 0.01104103, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.0005343, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.55319785738967, + "language_loss": 0.68250513, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70503038, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.543036937713623 + }, + { + "auxiliary_loss_clip": 0.01113025, + "auxiliary_loss_mlp": 0.01103919, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.0004456, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 1.673591686448474, + "language_loss": 0.70771182, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72988129, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 3.9921133518218994 + }, + { + "auxiliary_loss_clip": 0.01119569, + "auxiliary_loss_mlp": 0.01102591, + "balance_loss_clip": 1.00176024, + "balance_loss_mlp": 1.00045252, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 1.4567176405724713, + "language_loss": 0.68321884, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70544046, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.662984848022461 + }, + { + "auxiliary_loss_clip": 0.01131733, + "auxiliary_loss_mlp": 0.01102545, + "balance_loss_clip": 1.00169051, + "balance_loss_mlp": 1.00050187, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 1.912000880745925, + "language_loss": 0.70333374, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72567648, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 5.39457106590271 + }, + { + "auxiliary_loss_clip": 0.01132589, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.00046372, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 1.9834935618431169, + "language_loss": 0.66626132, + "learning_rate": 1.810670840677151e-07, + "loss": 0.6886189, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 2.581533432006836 + }, + { + "auxiliary_loss_clip": 0.01100222, + "auxiliary_loss_mlp": 0.01102681, + "balance_loss_clip": 1.00166345, + "balance_loss_mlp": 1.00063753, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 1.7803658218890754, + "language_loss": 0.68798727, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71001631, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.666438579559326 + }, + { + "auxiliary_loss_clip": 0.01150389, + "auxiliary_loss_mlp": 0.01103183, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00066257, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.0587976356843347, + "language_loss": 0.63406414, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65659988, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.4954190254211426 + }, + { + "auxiliary_loss_clip": 0.01149486, + "auxiliary_loss_mlp": 0.01103137, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00061691, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 2.1162085772201658, + "language_loss": 0.78294206, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80546832, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 2.504667043685913 + }, + { + "auxiliary_loss_clip": 0.0112889, + "auxiliary_loss_mlp": 0.01079735, + "balance_loss_clip": 1.00119257, + "balance_loss_mlp": 1.00000751, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7028904973435237, + "language_loss": 0.58441889, + "learning_rate": 1.804199186231805e-07, + "loss": 0.6065051, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 3.229196071624756 + }, + { + "auxiliary_loss_clip": 0.01133326, + "auxiliary_loss_mlp": 0.01102005, + "balance_loss_clip": 1.00161791, + "balance_loss_mlp": 1.00053453, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 2.4237360253021847, + "language_loss": 0.80061305, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82296634, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.6492388248443604 + }, + { + "auxiliary_loss_clip": 0.01133045, + "auxiliary_loss_mlp": 0.00747535, + "balance_loss_clip": 1.00165272, + "balance_loss_mlp": 1.00045586, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 2.2770735737243832, + "language_loss": 0.62047249, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.63927829, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.685176134109497 + }, + { + "auxiliary_loss_clip": 0.01131486, + "auxiliary_loss_mlp": 0.01103117, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00050163, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.457510328548, + "language_loss": 0.70504296, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72738898, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.55574893951416 + }, + { + "auxiliary_loss_clip": 0.01116415, + "auxiliary_loss_mlp": 0.01103072, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00045669, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 1.955117688805923, + "language_loss": 0.80818415, + "learning_rate": 1.797738571571381e-07, + "loss": 0.83037901, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 2.655263900756836 + }, + { + "auxiliary_loss_clip": 0.01148337, + "auxiliary_loss_mlp": 0.01101771, + "balance_loss_clip": 1.00152206, + "balance_loss_mlp": 1.00049067, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.7242656288486333, + "language_loss": 0.67390752, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69640863, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.5201358795166016 + }, + { + "auxiliary_loss_clip": 0.01148224, + "auxiliary_loss_mlp": 0.0110238, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.0005275, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.4191046809545567, + "language_loss": 0.6397512, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.66225731, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.681891679763794 + }, + { + "auxiliary_loss_clip": 0.01150149, + "auxiliary_loss_mlp": 0.0110308, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.00065553, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 2.5809939427824697, + "language_loss": 0.6581679, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68070024, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 2.6337788105010986 + }, + { + "auxiliary_loss_clip": 0.01148557, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00049686, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.4479415167687186, + "language_loss": 0.66303855, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68555242, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 2.6307945251464844 + }, + { + "auxiliary_loss_clip": 0.01133941, + "auxiliary_loss_mlp": 0.01104363, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00050831, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 2.010599149231833, + "language_loss": 0.72719437, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74957734, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.547382354736328 + }, + { + "auxiliary_loss_clip": 0.01164923, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00045347, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.660035696910194, + "language_loss": 0.83443987, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85712171, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.5383780002593994 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.01103038, + "balance_loss_clip": 1.00182593, + "balance_loss_mlp": 1.00051832, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 2.057438752521337, + "language_loss": 0.77221382, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79439622, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 2.599238395690918 + }, + { + "auxiliary_loss_clip": 0.01147839, + "auxiliary_loss_mlp": 0.01103765, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.00048184, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.888359689240164, + "language_loss": 0.67614567, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.69866163, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 2.53334379196167 + }, + { + "auxiliary_loss_clip": 0.0115028, + "auxiliary_loss_mlp": 0.01102692, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00036252, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.5946781019581646, + "language_loss": 0.83269906, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85522878, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.553384780883789 + }, + { + "auxiliary_loss_clip": 0.01068941, + "auxiliary_loss_mlp": 0.0110207, + "balance_loss_clip": 1.00141191, + "balance_loss_mlp": 1.00031304, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.6124306070798473, + "language_loss": 0.74064487, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76235497, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 2.750694990158081 + }, + { + "auxiliary_loss_clip": 0.01133413, + "auxiliary_loss_mlp": 0.01102613, + "balance_loss_clip": 1.0017606, + "balance_loss_mlp": 1.00037932, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.939459132774364, + "language_loss": 0.80638099, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82874125, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 2.5522241592407227 + }, + { + "auxiliary_loss_clip": 0.01112601, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.00111628, + "balance_loss_mlp": 1.00008285, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8995448510680818, + "language_loss": 0.60569543, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62761956, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.0996954441070557 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_clip": 1.00175202, + "balance_loss_mlp": 1.00040781, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6516910448500193, + "language_loss": 0.76091015, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78315318, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.6380393505096436 + }, + { + "auxiliary_loss_clip": 0.01148207, + "auxiliary_loss_mlp": 0.01103147, + "balance_loss_clip": 1.00180316, + "balance_loss_mlp": 1.0003407, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.46696227481563, + "language_loss": 0.72325999, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74577355, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 3.9037985801696777 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.00747326, + "balance_loss_clip": 1.00165343, + "balance_loss_mlp": 1.0003562, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.989900717663703, + "language_loss": 0.72412652, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74293381, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 2.5793991088867188 + }, + { + "auxiliary_loss_clip": 0.01150425, + "auxiliary_loss_mlp": 0.01102331, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00057352, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 4.200119008248616, + "language_loss": 0.73956001, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.76208758, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.5085015296936035 + }, + { + "auxiliary_loss_clip": 0.01165142, + "auxiliary_loss_mlp": 0.01103341, + "balance_loss_clip": 1.00206912, + "balance_loss_mlp": 1.00044012, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 2.966124492992975, + "language_loss": 0.59294385, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61562866, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 2.623318672180176 + }, + { + "auxiliary_loss_clip": 0.01133509, + "auxiliary_loss_mlp": 0.01102701, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.0004667, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.062894654009373, + "language_loss": 0.80265903, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82502115, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 2.5407888889312744 + }, + { + "auxiliary_loss_clip": 0.01085989, + "auxiliary_loss_mlp": 0.01104875, + "balance_loss_clip": 1.00169671, + "balance_loss_mlp": 1.00044799, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 3.523712428621309, + "language_loss": 0.74501604, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76692462, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.7044758796691895 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.01102655, + "balance_loss_clip": 1.00185311, + "balance_loss_mlp": 1.00042105, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.5915397360044377, + "language_loss": 0.78155464, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80358672, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 2.7138001918792725 + }, + { + "auxiliary_loss_clip": 0.01149828, + "auxiliary_loss_mlp": 0.01103145, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00052989, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.7474988091045487, + "language_loss": 0.70679092, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.72932065, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 2.5290486812591553 + }, + { + "auxiliary_loss_clip": 0.01133531, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.0018059, + "balance_loss_mlp": 1.00053549, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.3310854516197639, + "language_loss": 0.73813719, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76049161, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 2.6327896118164062 + }, + { + "auxiliary_loss_clip": 0.01133344, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_clip": 1.00176418, + "balance_loss_mlp": 1.0005461, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.8949757503907236, + "language_loss": 0.64915878, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67152381, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.6017160415649414 + }, + { + "auxiliary_loss_clip": 0.01150198, + "auxiliary_loss_mlp": 0.01102772, + "balance_loss_clip": 1.00178289, + "balance_loss_mlp": 1.00063348, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 1.980325970095631, + "language_loss": 0.82963979, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.85216951, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 2.489546775817871 + }, + { + "auxiliary_loss_clip": 0.01150349, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00048125, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 2.5413410043630886, + "language_loss": 0.65488636, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67742848, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.5095577239990234 + }, + { + "auxiliary_loss_clip": 0.01132785, + "auxiliary_loss_mlp": 0.01104016, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00054288, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 1.8353120015459885, + "language_loss": 0.6641655, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68653345, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 2.5508997440338135 + }, + { + "auxiliary_loss_clip": 0.0113154, + "auxiliary_loss_mlp": 0.01103248, + "balance_loss_clip": 1.00166404, + "balance_loss_mlp": 1.00044227, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 1.9711671560285964, + "language_loss": 0.6273545, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64970231, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 4.087656736373901 + }, + { + "auxiliary_loss_clip": 0.01148096, + "auxiliary_loss_mlp": 0.01102019, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00054789, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.4028860033911217, + "language_loss": 0.84959698, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.87209809, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 2.5449166297912598 + }, + { + "auxiliary_loss_clip": 0.01134257, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_clip": 1.00193393, + "balance_loss_mlp": 1.00061011, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 3.0008911605393176, + "language_loss": 0.61890405, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64129317, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 5.409346103668213 + }, + { + "auxiliary_loss_clip": 0.01164715, + "auxiliary_loss_mlp": 0.01101764, + "balance_loss_clip": 1.00173879, + "balance_loss_mlp": 1.00038838, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.2889846182330853, + "language_loss": 0.68882966, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71149445, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.587392568588257 + }, + { + "auxiliary_loss_clip": 0.0113179, + "auxiliary_loss_mlp": 0.01102248, + "balance_loss_clip": 1.00165975, + "balance_loss_mlp": 1.0004909, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.651900067310297, + "language_loss": 0.71054721, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73288757, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 2.6208040714263916 + }, + { + "auxiliary_loss_clip": 0.01147701, + "auxiliary_loss_mlp": 0.01101873, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.00049734, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 3.8103007011528915, + "language_loss": 0.83930302, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86179876, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.508105754852295 + }, + { + "auxiliary_loss_clip": 0.01132789, + "auxiliary_loss_mlp": 0.01102869, + "balance_loss_clip": 1.00174236, + "balance_loss_mlp": 1.00044405, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.7588394975415105, + "language_loss": 0.73014677, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75250334, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 2.577772378921509 + }, + { + "auxiliary_loss_clip": 0.01164919, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00040317, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.5418602245708761, + "language_loss": 0.78925157, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81192523, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 2.517103672027588 + }, + { + "auxiliary_loss_clip": 0.01148708, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00039065, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 1.7699951969109742, + "language_loss": 0.72607183, + "learning_rate": 1.741679706279644e-07, + "loss": 0.7450332, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.525001287460327 + }, + { + "auxiliary_loss_clip": 0.01165068, + "auxiliary_loss_mlp": 0.01102715, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00048113, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.4942545509539111, + "language_loss": 0.72645718, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74913502, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.5404460430145264 + }, + { + "auxiliary_loss_clip": 0.01131545, + "auxiliary_loss_mlp": 0.01102752, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00051832, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 5.88174269027516, + "language_loss": 0.66989696, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69223994, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.624985933303833 + }, + { + "auxiliary_loss_clip": 0.01164985, + "auxiliary_loss_mlp": 0.01102723, + "balance_loss_clip": 1.00188649, + "balance_loss_mlp": 1.00039411, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.681055232125899, + "language_loss": 0.78040308, + "learning_rate": 1.736914088262349e-07, + "loss": 0.80308014, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149991, + "auxiliary_loss_mlp": 0.01102577, + "balance_loss_clip": 1.00190771, + "balance_loss_mlp": 1.00034308, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.6216155359659463, + "language_loss": 0.7233001, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74582577, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.558577060699463 + }, + { + "auxiliary_loss_clip": 0.01149341, + "auxiliary_loss_mlp": 0.01103029, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00041318, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 1.6876884851819973, + "language_loss": 0.59193289, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61445659, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 2.4869940280914307 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01102245, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.00058377, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.545783826570323, + "language_loss": 0.71337616, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73590291, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.545415163040161 + }, + { + "auxiliary_loss_clip": 0.01131554, + "auxiliary_loss_mlp": 0.01102657, + "balance_loss_clip": 1.00172651, + "balance_loss_mlp": 1.00042343, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.466214927480218, + "language_loss": 0.70792961, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.7302717, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 2.6177890300750732 + }, + { + "auxiliary_loss_clip": 0.01101937, + "auxiliary_loss_mlp": 0.01102829, + "balance_loss_clip": 1.00153995, + "balance_loss_mlp": 1.00040412, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.554448249965348, + "language_loss": 0.69853765, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72058535, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 2.708178758621216 + }, + { + "auxiliary_loss_clip": 0.01147999, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_clip": 1.00164676, + "balance_loss_mlp": 1.00042284, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.5537266708239634, + "language_loss": 0.76702297, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.78952384, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.580636978149414 + }, + { + "auxiliary_loss_clip": 0.01132582, + "auxiliary_loss_mlp": 0.01102949, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.000525, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.8775524161747656, + "language_loss": 0.76679599, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78915131, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 2.5504167079925537 + }, + { + "auxiliary_loss_clip": 0.01150416, + "auxiliary_loss_mlp": 0.01103777, + "balance_loss_clip": 1.00192487, + "balance_loss_mlp": 1.00049448, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.0935265808889967, + "language_loss": 0.61883497, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64137691, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 2.529231071472168 + }, + { + "auxiliary_loss_clip": 0.01165149, + "auxiliary_loss_mlp": 0.01103615, + "balance_loss_clip": 1.00206864, + "balance_loss_mlp": 1.00052297, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.6741483430165487, + "language_loss": 0.68109858, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70378619, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.4765424728393555 + }, + { + "auxiliary_loss_clip": 0.01085461, + "auxiliary_loss_mlp": 0.00747391, + "balance_loss_clip": 1.00165546, + "balance_loss_mlp": 1.00038958, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.8102668895861662, + "language_loss": 0.63362813, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.65195668, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.7614786624908447 + }, + { + "auxiliary_loss_clip": 0.01165055, + "auxiliary_loss_mlp": 0.01104261, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00050104, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.396647177795972, + "language_loss": 0.61639804, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63909125, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.5974271297454834 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.00747195, + "balance_loss_clip": 1.00168049, + "balance_loss_mlp": 1.00041759, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.8861198714961749, + "language_loss": 0.67711288, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69592112, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 2.5681166648864746 + }, + { + "auxiliary_loss_clip": 0.01133549, + "auxiliary_loss_mlp": 0.00747098, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00031376, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 2.364304405390263, + "language_loss": 0.8583672, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87717366, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.5554521083831787 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.01103446, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.0004493, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.421360528632258, + "language_loss": 0.75259113, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.77511066, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 4.159968376159668 + }, + { + "auxiliary_loss_clip": 0.01149852, + "auxiliary_loss_mlp": 0.01103573, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00038505, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 3.4690717471941745, + "language_loss": 0.76281857, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78535283, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.489482879638672 + }, + { + "auxiliary_loss_clip": 0.01116115, + "auxiliary_loss_mlp": 0.01102478, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00033975, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.4784958861470234, + "language_loss": 0.66802573, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69021171, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 2.5987179279327393 + }, + { + "auxiliary_loss_clip": 0.01148109, + "auxiliary_loss_mlp": 0.01101835, + "balance_loss_clip": 1.00172126, + "balance_loss_mlp": 1.00036383, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.8763945146115704, + "language_loss": 0.69654858, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71904802, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 2.5509161949157715 + }, + { + "auxiliary_loss_clip": 0.01165129, + "auxiliary_loss_mlp": 0.01104508, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.0005579, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.3137290154646917, + "language_loss": 0.88961583, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91231221, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 2.47822904586792 + }, + { + "auxiliary_loss_clip": 0.01115567, + "auxiliary_loss_mlp": 0.01102071, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00050461, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 1.5935428258713058, + "language_loss": 0.59200132, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.6141777, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.7613985538482666 + }, + { + "auxiliary_loss_clip": 0.01116447, + "auxiliary_loss_mlp": 0.01102659, + "balance_loss_clip": 1.00160813, + "balance_loss_mlp": 1.00047314, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.8810477124233724, + "language_loss": 0.80311477, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82530582, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.6054298877716064 + }, + { + "auxiliary_loss_clip": 0.01129708, + "auxiliary_loss_mlp": 0.01103587, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0004952, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 2.056023003625443, + "language_loss": 0.78688318, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80921608, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.5543246269226074 + }, + { + "auxiliary_loss_clip": 0.01165142, + "auxiliary_loss_mlp": 0.01103919, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00034976, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.503499505505813, + "language_loss": 0.66739982, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69009042, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 2.481647491455078 + }, + { + "auxiliary_loss_clip": 0.01114393, + "auxiliary_loss_mlp": 0.01102243, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00039065, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.7146691186189638, + "language_loss": 0.57414651, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59631288, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 2.6765971183776855 + }, + { + "auxiliary_loss_clip": 0.01117028, + "auxiliary_loss_mlp": 0.01103688, + "balance_loss_clip": 1.00186312, + "balance_loss_mlp": 1.00050044, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 1.7841634596458942, + "language_loss": 0.79995745, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.82216465, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 2.623535633087158 + }, + { + "auxiliary_loss_clip": 0.01148612, + "auxiliary_loss_mlp": 0.01103638, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00045085, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 1.8757019835247608, + "language_loss": 0.72841465, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.75093722, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 2.516998052597046 + }, + { + "auxiliary_loss_clip": 0.01119229, + "auxiliary_loss_mlp": 0.01103728, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00044513, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.7027223001077347, + "language_loss": 0.64178228, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66401184, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.5953848361968994 + }, + { + "auxiliary_loss_clip": 0.01133364, + "auxiliary_loss_mlp": 0.01103436, + "balance_loss_clip": 1.00175476, + "balance_loss_mlp": 1.00043917, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.6806659487178603, + "language_loss": 0.68474275, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.70711076, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 4.083377838134766 + }, + { + "auxiliary_loss_clip": 0.01131568, + "auxiliary_loss_mlp": 0.01102191, + "balance_loss_clip": 1.0017873, + "balance_loss_mlp": 1.00052941, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.638246835520146, + "language_loss": 0.69427967, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71661723, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.572211742401123 + }, + { + "auxiliary_loss_clip": 0.01148268, + "auxiliary_loss_mlp": 0.00747481, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00039434, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.8663506166645243, + "language_loss": 0.70157564, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72053313, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 5.433186769485474 + }, + { + "auxiliary_loss_clip": 0.01132738, + "auxiliary_loss_mlp": 0.0110188, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.000314, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.8164025910667978, + "language_loss": 0.78445715, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80680335, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.582554817199707 + }, + { + "auxiliary_loss_clip": 0.01133675, + "auxiliary_loss_mlp": 0.01103932, + "balance_loss_clip": 1.00168979, + "balance_loss_mlp": 1.00045812, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.318986677451295, + "language_loss": 0.74059832, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76297438, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.574693202972412 + }, + { + "auxiliary_loss_clip": 0.01085388, + "auxiliary_loss_mlp": 0.01104205, + "balance_loss_clip": 1.00161028, + "balance_loss_mlp": 1.00054026, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.9735289194453327, + "language_loss": 0.72168702, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74358296, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 2.6980037689208984 + }, + { + "auxiliary_loss_clip": 0.01131542, + "auxiliary_loss_mlp": 0.01103737, + "balance_loss_clip": 1.001724, + "balance_loss_mlp": 1.00045443, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 3.4151922939010326, + "language_loss": 0.6832723, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70562506, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 2.6276094913482666 + }, + { + "auxiliary_loss_clip": 0.01131439, + "auxiliary_loss_mlp": 0.01103322, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00042105, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.699729417452346, + "language_loss": 0.58498836, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60733604, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 2.6099414825439453 + }, + { + "auxiliary_loss_clip": 0.01165227, + "auxiliary_loss_mlp": 0.01104947, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00051975, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.100006203700088, + "language_loss": 0.67062998, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69333172, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.484990358352661 + }, + { + "auxiliary_loss_clip": 0.0108836, + "auxiliary_loss_mlp": 0.01102741, + "balance_loss_clip": 1.00155318, + "balance_loss_mlp": 1.00050724, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 1.6649232007111932, + "language_loss": 0.81926686, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.84117782, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 2.7202537059783936 + }, + { + "auxiliary_loss_clip": 0.01129114, + "auxiliary_loss_mlp": 0.01079396, + "balance_loss_clip": 1.0011344, + "balance_loss_mlp": 1.00005043, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7939759879987129, + "language_loss": 0.58685982, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60894489, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 3.0396482944488525 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00048625, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 2.817273060705503, + "language_loss": 0.76700759, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78952205, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 2.529183864593506 + }, + { + "auxiliary_loss_clip": 0.01150431, + "auxiliary_loss_mlp": 0.01103968, + "balance_loss_clip": 1.00194609, + "balance_loss_mlp": 1.00039887, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.7170894485986643, + "language_loss": 0.65634203, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67888606, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.562917947769165 + }, + { + "auxiliary_loss_clip": 0.01150375, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.0005368, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 1.9773471970303873, + "language_loss": 0.78776723, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81030536, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.5213708877563477 + }, + { + "auxiliary_loss_clip": 0.01165154, + "auxiliary_loss_mlp": 0.01103803, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.0004251, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.565221279080875, + "language_loss": 0.7180618, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74075139, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.460975408554077 + }, + { + "auxiliary_loss_clip": 0.01114517, + "auxiliary_loss_mlp": 0.01101679, + "balance_loss_clip": 1.00161254, + "balance_loss_mlp": 1.00039899, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 2.1167653158437973, + "language_loss": 0.72487342, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74703532, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 2.5938720703125 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01101977, + "balance_loss_clip": 1.0017271, + "balance_loss_mlp": 1.00041103, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.2831267199188021, + "language_loss": 0.74033093, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76269877, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.597228765487671 + }, + { + "auxiliary_loss_clip": 0.01148296, + "auxiliary_loss_mlp": 0.01103613, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00032997, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.389512935221648, + "language_loss": 0.75886893, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78138804, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 2.4956889152526855 + }, + { + "auxiliary_loss_clip": 0.01131236, + "auxiliary_loss_mlp": 0.01103731, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.00054312, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 2.043836408865767, + "language_loss": 0.8215338, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84388345, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 2.6170687675476074 + }, + { + "auxiliary_loss_clip": 0.01148509, + "auxiliary_loss_mlp": 0.01103339, + "balance_loss_clip": 1.00167441, + "balance_loss_mlp": 1.00043774, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 1.7555561324397935, + "language_loss": 0.75824749, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78076595, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 2.4855387210845947 + }, + { + "auxiliary_loss_clip": 0.01150066, + "auxiliary_loss_mlp": 0.00747271, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00037861, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 2.123062560985968, + "language_loss": 0.75708944, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77606285, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 2.5549402236938477 + }, + { + "auxiliary_loss_clip": 0.01150235, + "auxiliary_loss_mlp": 0.01102384, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00043583, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.6043885570993885, + "language_loss": 0.78171575, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80424201, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.5101819038391113 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01102288, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00043583, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 2.2086073551231102, + "language_loss": 0.77666807, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.7990458, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.561197280883789 + }, + { + "auxiliary_loss_clip": 0.01032501, + "auxiliary_loss_mlp": 0.01102763, + "balance_loss_clip": 1.00136328, + "balance_loss_mlp": 1.00052881, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.8971377782280976, + "language_loss": 0.69336152, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71471417, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 2.905028820037842 + }, + { + "auxiliary_loss_clip": 0.01098389, + "auxiliary_loss_mlp": 0.0110506, + "balance_loss_clip": 1.00147593, + "balance_loss_mlp": 1.00044203, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.9440153155893103, + "language_loss": 0.60773015, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.62976468, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 4.285131931304932 + }, + { + "auxiliary_loss_clip": 0.01148862, + "auxiliary_loss_mlp": 0.01104793, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00046098, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.9132349126517394, + "language_loss": 0.65808058, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.68061715, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 2.497833013534546 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01102676, + "balance_loss_clip": 1.00178587, + "balance_loss_mlp": 1.0004425, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 1.7798455365674186, + "language_loss": 0.89686781, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.919083, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.6158640384674072 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01101849, + "balance_loss_clip": 1.00181544, + "balance_loss_mlp": 1.00037801, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.6437320063116803, + "language_loss": 0.84908819, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87144101, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 2.6004743576049805 + }, + { + "auxiliary_loss_clip": 0.01133911, + "auxiliary_loss_mlp": 0.01102742, + "balance_loss_clip": 1.00167274, + "balance_loss_mlp": 1.00060332, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.9246627767779958, + "language_loss": 0.74521065, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76757717, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 2.570143699645996 + }, + { + "auxiliary_loss_clip": 0.011497, + "auxiliary_loss_mlp": 0.01102195, + "balance_loss_clip": 1.00174236, + "balance_loss_mlp": 1.00053382, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 2.136059881609762, + "language_loss": 0.61833143, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.64085031, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 2.534306764602661 + }, + { + "auxiliary_loss_clip": 0.01143695, + "auxiliary_loss_mlp": 0.01079795, + "balance_loss_clip": 1.00111508, + "balance_loss_mlp": 1.00006807, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.83270950636208, + "language_loss": 0.58735436, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60958928, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 3.230023145675659 + }, + { + "auxiliary_loss_clip": 0.01133781, + "auxiliary_loss_mlp": 0.0110168, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.0004003, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.3537295596472447, + "language_loss": 0.76796305, + "learning_rate": 1.646005846335954e-07, + "loss": 0.79031765, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.623030185699463 + }, + { + "auxiliary_loss_clip": 0.01133971, + "auxiliary_loss_mlp": 0.01103419, + "balance_loss_clip": 1.00187862, + "balance_loss_mlp": 1.00051737, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 1.7034297814128396, + "language_loss": 0.75081646, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77319038, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.5695323944091797 + }, + { + "auxiliary_loss_clip": 0.01164988, + "auxiliary_loss_mlp": 0.01102829, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00040436, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.6644641466564252, + "language_loss": 0.7432884, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76596653, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.570333957672119 + }, + { + "auxiliary_loss_clip": 0.01133539, + "auxiliary_loss_mlp": 0.01102345, + "balance_loss_clip": 1.00162065, + "balance_loss_mlp": 1.00058854, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.6046180475268013, + "language_loss": 0.63776952, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66012836, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 2.558621883392334 + }, + { + "auxiliary_loss_clip": 0.01150441, + "auxiliary_loss_mlp": 0.01102613, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 1.0002836, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.6995300274912895, + "language_loss": 0.58263713, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60516769, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 2.5660059452056885 + }, + { + "auxiliary_loss_clip": 0.01149883, + "auxiliary_loss_mlp": 0.01102322, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00037456, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 1.923976046245515, + "language_loss": 0.68816674, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.71068877, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.533214569091797 + }, + { + "auxiliary_loss_clip": 0.01150413, + "auxiliary_loss_mlp": 0.01103474, + "balance_loss_clip": 1.00170553, + "balance_loss_mlp": 1.00038218, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 1.93546817668943, + "language_loss": 0.74059844, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76313728, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 3.9258370399475098 + }, + { + "auxiliary_loss_clip": 0.01133309, + "auxiliary_loss_mlp": 0.01102731, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00040185, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.719671527969454, + "language_loss": 0.78817213, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81053257, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 2.6137943267822266 + }, + { + "auxiliary_loss_clip": 0.01102355, + "auxiliary_loss_mlp": 0.01103725, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00053704, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.0605626504797296, + "language_loss": 0.66324168, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.6853025, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 4.143803834915161 + }, + { + "auxiliary_loss_clip": 0.01160379, + "auxiliary_loss_mlp": 0.01079415, + "balance_loss_clip": 1.00114012, + "balance_loss_mlp": 1.0000689, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.782790386237102, + "language_loss": 0.54555333, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56795132, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 2.9299745559692383 + }, + { + "auxiliary_loss_clip": 0.01130962, + "auxiliary_loss_mlp": 0.01103415, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00051308, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 1.9551275566135147, + "language_loss": 0.69533503, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71767873, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 4.086646556854248 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01102207, + "balance_loss_clip": 1.0016861, + "balance_loss_mlp": 1.00054526, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.5930261091903235, + "language_loss": 0.7609539, + "learning_rate": 1.62902840325714e-07, + "loss": 0.78297961, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 2.7051444053649902 + }, + { + "auxiliary_loss_clip": 0.01150259, + "auxiliary_loss_mlp": 0.00747478, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00036979, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.6544563572063937, + "language_loss": 0.66296744, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68194473, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.7086684703826904 + }, + { + "auxiliary_loss_clip": 0.01164974, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00034511, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 3.0035585156706364, + "language_loss": 0.72815835, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75083864, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 2.5204555988311768 + }, + { + "auxiliary_loss_clip": 0.01165232, + "auxiliary_loss_mlp": 0.01104876, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00063968, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 4.820564378519752, + "language_loss": 0.69066143, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71336257, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 2.6134228706359863 + }, + { + "auxiliary_loss_clip": 0.01131403, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00156903, + "balance_loss_mlp": 1.00047541, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.1745976231158344, + "language_loss": 0.71034408, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.73268521, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 2.5741426944732666 + }, + { + "auxiliary_loss_clip": 0.01150449, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00040078, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.0306527842077626, + "language_loss": 0.83626342, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85524213, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.544616937637329 + }, + { + "auxiliary_loss_clip": 0.0114831, + "auxiliary_loss_mlp": 0.01102681, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00054288, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 5.10011865055654, + "language_loss": 0.72024792, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.74275786, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 2.5002827644348145 + }, + { + "auxiliary_loss_clip": 0.01150152, + "auxiliary_loss_mlp": 0.00747388, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00039244, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 1.76395708673098, + "language_loss": 0.64169234, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.66066778, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.5932157039642334 + }, + { + "auxiliary_loss_clip": 0.0111692, + "auxiliary_loss_mlp": 0.01103369, + "balance_loss_clip": 1.00167215, + "balance_loss_mlp": 1.00037241, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 1.7644637425114862, + "language_loss": 0.79735327, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81955618, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.6522133350372314 + }, + { + "auxiliary_loss_clip": 0.01150708, + "auxiliary_loss_mlp": 0.01102928, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00040841, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 2.585980936470304, + "language_loss": 0.70264816, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72518444, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.5113348960876465 + }, + { + "auxiliary_loss_clip": 0.01131941, + "auxiliary_loss_mlp": 0.00747209, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.00027084, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.443762506777935, + "language_loss": 0.83725941, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85605097, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.585991859436035 + }, + { + "auxiliary_loss_clip": 0.01148311, + "auxiliary_loss_mlp": 0.01103032, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.00051212, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.5733737013640834, + "language_loss": 0.71282768, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73534107, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.5675904750823975 + }, + { + "auxiliary_loss_clip": 0.0113474, + "auxiliary_loss_mlp": 0.01104255, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.00049496, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 1.7053649795785588, + "language_loss": 0.7645058, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78689575, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.5624094009399414 + }, + { + "auxiliary_loss_clip": 0.011173, + "auxiliary_loss_mlp": 0.01103317, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00070167, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 1.9504304792191562, + "language_loss": 0.82961565, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85182178, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 2.650465726852417 + }, + { + "auxiliary_loss_clip": 0.01160391, + "auxiliary_loss_mlp": 0.01079386, + "balance_loss_clip": 1.00112414, + "balance_loss_mlp": 1.00004005, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.7992380093710467, + "language_loss": 0.56105787, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58345562, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 3.1085751056671143 + }, + { + "auxiliary_loss_clip": 0.01148725, + "auxiliary_loss_mlp": 0.01102103, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00044179, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.6121502977553235, + "language_loss": 0.66209972, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68460798, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 2.509234666824341 + }, + { + "auxiliary_loss_clip": 0.01164904, + "auxiliary_loss_mlp": 0.01103293, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00048697, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.7136408333347082, + "language_loss": 0.78827578, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.81095767, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 2.5707924365997314 + }, + { + "auxiliary_loss_clip": 0.01165017, + "auxiliary_loss_mlp": 0.01103123, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00050759, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 2.1299368163752876, + "language_loss": 0.78007519, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.80275661, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 2.4940876960754395 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01101001, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00048327, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.4038312735615972, + "language_loss": 0.71514964, + "learning_rate": 1.601428988367981e-07, + "loss": 0.73780644, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.604724168777466 + }, + { + "auxiliary_loss_clip": 0.01165249, + "auxiliary_loss_mlp": 0.01103559, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00046706, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.409524986347432, + "language_loss": 0.65102267, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67371082, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.4612793922424316 + }, + { + "auxiliary_loss_clip": 0.01148125, + "auxiliary_loss_mlp": 0.01103081, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00056076, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.6198754881865909, + "language_loss": 0.70507497, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72758698, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 4.068518400192261 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.01104082, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00041771, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.517308797165831, + "language_loss": 0.77737075, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79973006, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 2.616518497467041 + }, + { + "auxiliary_loss_clip": 0.01131745, + "auxiliary_loss_mlp": 0.01103359, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00045729, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.667782776546184, + "language_loss": 0.70921016, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73156118, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.5517420768737793 + }, + { + "auxiliary_loss_clip": 0.01134155, + "auxiliary_loss_mlp": 0.00747451, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00038803, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.6735432919872095, + "language_loss": 0.74119675, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76001287, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.6073272228240967 + }, + { + "auxiliary_loss_clip": 0.01116828, + "auxiliary_loss_mlp": 0.01102936, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00060701, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 1.8213851846392453, + "language_loss": 0.86650729, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.8887049, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 2.6003785133361816 + }, + { + "auxiliary_loss_clip": 0.0111465, + "auxiliary_loss_mlp": 0.01102749, + "balance_loss_clip": 1.00150037, + "balance_loss_mlp": 1.00042009, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.7130112366118433, + "language_loss": 0.74123096, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76340497, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.5881974697113037 + }, + { + "auxiliary_loss_clip": 0.01148416, + "auxiliary_loss_mlp": 0.00747478, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00043774, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.7230109952598323, + "language_loss": 0.67800361, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69696254, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 2.5450756549835205 + }, + { + "auxiliary_loss_clip": 0.01117197, + "auxiliary_loss_mlp": 0.01102347, + "balance_loss_clip": 1.00172436, + "balance_loss_mlp": 1.00039923, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 1.812481772132414, + "language_loss": 0.62178314, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64397854, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.6090996265411377 + }, + { + "auxiliary_loss_clip": 0.01148159, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00037599, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.6549517633083162, + "language_loss": 0.73431408, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75681794, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 2.5758721828460693 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.00747237, + "balance_loss_clip": 1.00150967, + "balance_loss_mlp": 1.00030208, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 2.214713462360871, + "language_loss": 0.72804612, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74651563, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.613539457321167 + }, + { + "auxiliary_loss_clip": 0.01148237, + "auxiliary_loss_mlp": 0.01103256, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00054538, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.728013700086676, + "language_loss": 0.75944436, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.78195935, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 2.4974873065948486 + }, + { + "auxiliary_loss_clip": 0.01133709, + "auxiliary_loss_mlp": 0.0110304, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00061512, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.723356200818898, + "language_loss": 0.66738045, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.68974793, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.6639347076416016 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01101651, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.00046611, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 2.429681265391202, + "language_loss": 0.66405141, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68640542, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 2.572824239730835 + }, + { + "auxiliary_loss_clip": 0.01148436, + "auxiliary_loss_mlp": 0.01103917, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.00044322, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 8.187712148220307, + "language_loss": 0.71095788, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73348147, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 4.093372344970703 + }, + { + "auxiliary_loss_clip": 0.01165192, + "auxiliary_loss_mlp": 0.01104098, + "balance_loss_clip": 1.00193536, + "balance_loss_mlp": 1.00043321, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 2.176239016041455, + "language_loss": 0.70982361, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73251653, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 2.4792048931121826 + }, + { + "auxiliary_loss_clip": 0.0113521, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.00047088, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.9210985131019551, + "language_loss": 0.70451468, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72688907, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 3.9438204765319824 + }, + { + "auxiliary_loss_clip": 0.01164991, + "auxiliary_loss_mlp": 0.00747311, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00034416, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.656062065494621, + "language_loss": 0.65549564, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67461872, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.5283071994781494 + }, + { + "auxiliary_loss_clip": 0.01131533, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_clip": 1.00177109, + "balance_loss_mlp": 1.00048625, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.5211833855876626, + "language_loss": 0.73566973, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75800461, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 4.132153272628784 + }, + { + "auxiliary_loss_clip": 0.0116501, + "auxiliary_loss_mlp": 0.00747377, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00033927, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.5261539372575945, + "language_loss": 0.6681118, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.68723559, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.4963462352752686 + }, + { + "auxiliary_loss_clip": 0.01149494, + "auxiliary_loss_mlp": 0.00747333, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.000386, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.6636515164667565, + "language_loss": 0.79070312, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.8096714, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.538480758666992 + }, + { + "auxiliary_loss_clip": 0.01115111, + "auxiliary_loss_mlp": 0.01102684, + "balance_loss_clip": 1.00159991, + "balance_loss_mlp": 1.00035477, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5657327694906038, + "language_loss": 0.72393209, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74611002, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 2.62862491607666 + }, + { + "auxiliary_loss_clip": 0.01135747, + "auxiliary_loss_mlp": 0.01102377, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00042927, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 3.4209723686142706, + "language_loss": 0.7422021, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76458335, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 2.5717692375183105 + }, + { + "auxiliary_loss_clip": 0.01164906, + "auxiliary_loss_mlp": 0.01102941, + "balance_loss_clip": 1.00178838, + "balance_loss_mlp": 1.0004214, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.859341735627391, + "language_loss": 0.78709972, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80977821, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 2.5034778118133545 + }, + { + "auxiliary_loss_clip": 0.01147881, + "auxiliary_loss_mlp": 0.01103309, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00050259, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.5567385697652407, + "language_loss": 0.73916781, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76167971, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 2.525473117828369 + }, + { + "auxiliary_loss_clip": 0.01083065, + "auxiliary_loss_mlp": 0.00747378, + "balance_loss_clip": 1.00159085, + "balance_loss_mlp": 1.00036991, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.8014155909530007, + "language_loss": 0.66652566, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68483007, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.6959686279296875 + }, + { + "auxiliary_loss_clip": 0.01148367, + "auxiliary_loss_mlp": 0.01103521, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.0005244, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.198358444195884, + "language_loss": 0.70395875, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72647762, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.5538761615753174 + }, + { + "auxiliary_loss_clip": 0.01134017, + "auxiliary_loss_mlp": 0.01104487, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00053644, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.173319799367642, + "language_loss": 0.74648458, + "learning_rate": 1.558945991776086e-07, + "loss": 0.76886964, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.535248279571533 + }, + { + "auxiliary_loss_clip": 0.01164874, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00037503, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.9976952593592943, + "language_loss": 0.80005705, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82273185, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.4619510173797607 + }, + { + "auxiliary_loss_clip": 0.01164885, + "auxiliary_loss_mlp": 0.01102335, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00048208, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.5057136341396686, + "language_loss": 0.8261956, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84886777, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.495039463043213 + }, + { + "auxiliary_loss_clip": 0.01150535, + "auxiliary_loss_mlp": 0.0110163, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.0004456, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.492378591971658, + "language_loss": 0.75862122, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78114295, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.583681344985962 + }, + { + "auxiliary_loss_clip": 0.01103778, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00158858, + "balance_loss_mlp": 1.00044084, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.1909846221764884, + "language_loss": 0.77491915, + "learning_rate": 1.552921717241651e-07, + "loss": 0.79698741, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 2.619199752807617 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01102681, + "balance_loss_clip": 1.00165915, + "balance_loss_mlp": 1.00044727, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 2.5900268205172727, + "language_loss": 0.70977449, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.73196471, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.6382687091827393 + }, + { + "auxiliary_loss_clip": 0.01114872, + "auxiliary_loss_mlp": 0.01103037, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00042117, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.641543093595609, + "language_loss": 0.85633254, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.87851167, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.6269757747650146 + }, + { + "auxiliary_loss_clip": 0.0114824, + "auxiliary_loss_mlp": 0.01103155, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00053954, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.6647023605727336, + "language_loss": 0.73060977, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.75312376, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 2.5923194885253906 + }, + { + "auxiliary_loss_clip": 0.01150502, + "auxiliary_loss_mlp": 0.00747289, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00033474, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.5048335126998094, + "language_loss": 0.7760787, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79505658, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 2.5350453853607178 + }, + { + "auxiliary_loss_clip": 0.01118776, + "auxiliary_loss_mlp": 0.01103141, + "balance_loss_clip": 1.00173485, + "balance_loss_mlp": 1.00052512, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 4.60963678650541, + "language_loss": 0.68019384, + "learning_rate": 1.545407113589332e-07, + "loss": 0.70241302, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.6209168434143066 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01102213, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00045609, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.6765453011831624, + "language_loss": 0.69364482, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71615064, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 2.778808355331421 + }, + { + "auxiliary_loss_clip": 0.01149642, + "auxiliary_loss_mlp": 0.01104644, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00050235, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 2.0205633341681652, + "language_loss": 0.73296112, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75550401, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 3.9967081546783447 + }, + { + "auxiliary_loss_clip": 0.01164904, + "auxiliary_loss_mlp": 0.01102898, + "balance_loss_clip": 1.00183475, + "balance_loss_mlp": 1.0004729, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 1.7793072106979415, + "language_loss": 0.70710605, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.72978407, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.4722204208374023 + }, + { + "auxiliary_loss_clip": 0.01128493, + "auxiliary_loss_mlp": 0.01079819, + "balance_loss_clip": 1.00110364, + "balance_loss_mlp": 1.00009203, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7398774717938015, + "language_loss": 0.54142118, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56350428, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 3.1521966457366943 + }, + { + "auxiliary_loss_clip": 0.01114518, + "auxiliary_loss_mlp": 0.01079806, + "balance_loss_clip": 1.00104082, + "balance_loss_mlp": 1.0000788, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7102112087861507, + "language_loss": 0.5928942, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61483741, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.2160451412200928 + }, + { + "auxiliary_loss_clip": 0.01100618, + "auxiliary_loss_mlp": 0.01103503, + "balance_loss_clip": 1.00169277, + "balance_loss_mlp": 1.00050569, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.5921792113640152, + "language_loss": 0.85225797, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87429917, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 2.9151368141174316 + }, + { + "auxiliary_loss_clip": 0.01165252, + "auxiliary_loss_mlp": 0.01104165, + "balance_loss_clip": 1.002033, + "balance_loss_mlp": 1.00040519, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 1.78183444451675, + "language_loss": 0.70550656, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72820067, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.4800384044647217 + }, + { + "auxiliary_loss_clip": 0.01133527, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_clip": 1.00171649, + "balance_loss_mlp": 1.00049186, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.9125001756988356, + "language_loss": 0.72312093, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74547493, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 2.592949151992798 + }, + { + "auxiliary_loss_clip": 0.01150373, + "auxiliary_loss_mlp": 0.01104373, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.000422, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 1.8411092190330396, + "language_loss": 0.87369752, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.896245, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.531154155731201 + }, + { + "auxiliary_loss_clip": 0.01099955, + "auxiliary_loss_mlp": 0.01103935, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.00055671, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.5458423242974166, + "language_loss": 0.70224893, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72428781, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 2.661557197570801 + }, + { + "auxiliary_loss_clip": 0.01148537, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.00185025, + "balance_loss_mlp": 1.0004096, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 2.010776737763336, + "language_loss": 0.80281544, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82177478, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01165096, + "auxiliary_loss_mlp": 0.01103351, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.0004499, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.5446390651811426, + "language_loss": 0.76691711, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78960162, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.5007216930389404 + }, + { + "auxiliary_loss_clip": 0.01095556, + "auxiliary_loss_mlp": 0.01102267, + "balance_loss_clip": 1.0017252, + "balance_loss_mlp": 1.00051045, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.4039283677447056, + "language_loss": 0.72517699, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74715519, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 2.668738842010498 + }, + { + "auxiliary_loss_clip": 0.01114564, + "auxiliary_loss_mlp": 0.01079446, + "balance_loss_clip": 1.00098073, + "balance_loss_mlp": 1.00010014, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0423134476431972, + "language_loss": 0.64605033, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66799045, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 2.93438720703125 + }, + { + "auxiliary_loss_clip": 0.01160365, + "auxiliary_loss_mlp": 0.01079412, + "balance_loss_clip": 1.00112581, + "balance_loss_mlp": 1.00006628, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6685461480882926, + "language_loss": 0.58617824, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.608576, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.1195435523986816 + }, + { + "auxiliary_loss_clip": 0.01099808, + "auxiliary_loss_mlp": 0.01102503, + "balance_loss_clip": 1.00145936, + "balance_loss_mlp": 1.00045955, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 2.011032369969853, + "language_loss": 0.72890562, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.75092876, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 4.092304944992065 + }, + { + "auxiliary_loss_clip": 0.01160375, + "auxiliary_loss_mlp": 0.01079381, + "balance_loss_clip": 1.00113583, + "balance_loss_mlp": 1.00003552, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8335989111135835, + "language_loss": 0.57972884, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60212636, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.1521055698394775 + }, + { + "auxiliary_loss_clip": 0.01134307, + "auxiliary_loss_mlp": 0.01102565, + "balance_loss_clip": 1.00172424, + "balance_loss_mlp": 1.00042629, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.7327631469475595, + "language_loss": 0.83443761, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85680634, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 3.9757330417633057 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01101777, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00040126, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.612698299490346, + "language_loss": 0.69296587, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71529686, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 4.096489667892456 + }, + { + "auxiliary_loss_clip": 0.01100385, + "auxiliary_loss_mlp": 0.01103427, + "balance_loss_clip": 1.00156152, + "balance_loss_mlp": 1.000525, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.7096380174429255, + "language_loss": 0.77932346, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.80136162, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.663639783859253 + }, + { + "auxiliary_loss_clip": 0.01133876, + "auxiliary_loss_mlp": 0.01104279, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00042367, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.7207035153539296, + "language_loss": 0.79101712, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81339872, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.610779047012329 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01103068, + "balance_loss_clip": 1.0016619, + "balance_loss_mlp": 1.000453, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.583015810383947, + "language_loss": 0.66307032, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.68541783, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 2.593888521194458 + }, + { + "auxiliary_loss_clip": 0.01131738, + "auxiliary_loss_mlp": 0.01102564, + "balance_loss_clip": 1.00174272, + "balance_loss_mlp": 1.00052071, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.821667346371017, + "language_loss": 0.72981298, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75215596, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 2.5903472900390625 + }, + { + "auxiliary_loss_clip": 0.01101103, + "auxiliary_loss_mlp": 0.01103057, + "balance_loss_clip": 1.00168109, + "balance_loss_mlp": 1.00044203, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.9090089136622659, + "language_loss": 0.78016019, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80220187, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 2.669283866882324 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.0110359, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00049806, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.7310439550226575, + "language_loss": 0.79785055, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.82038808, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 2.5630550384521484 + }, + { + "auxiliary_loss_clip": 0.01130939, + "auxiliary_loss_mlp": 0.01101797, + "balance_loss_clip": 1.00167131, + "balance_loss_mlp": 1.00051737, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.509441800548847, + "language_loss": 0.73958409, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76191139, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.597776174545288 + }, + { + "auxiliary_loss_clip": 0.01148265, + "auxiliary_loss_mlp": 0.01103274, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00037241, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.4321798182376932, + "language_loss": 0.71159005, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73410547, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 2.652601718902588 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01102557, + "balance_loss_clip": 1.00171447, + "balance_loss_mlp": 1.00041807, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.685694491721209, + "language_loss": 0.72472912, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74691713, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 2.599613666534424 + }, + { + "auxiliary_loss_clip": 0.01119659, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.00174391, + "balance_loss_mlp": 1.00048304, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 3.1432256678802117, + "language_loss": 0.68765146, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.70988667, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.5826518535614014 + }, + { + "auxiliary_loss_clip": 0.01131194, + "auxiliary_loss_mlp": 0.01101455, + "balance_loss_clip": 1.001508, + "balance_loss_mlp": 1.00046134, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.424476972391012, + "language_loss": 0.68610597, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.7084325, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.622831106185913 + }, + { + "auxiliary_loss_clip": 0.01131398, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00050986, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.4809035391397773, + "language_loss": 0.74076927, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76310879, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.6432504653930664 + }, + { + "auxiliary_loss_clip": 0.01133381, + "auxiliary_loss_mlp": 0.00747259, + "balance_loss_clip": 1.00171268, + "balance_loss_mlp": 1.00036907, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 2.154633790382501, + "language_loss": 0.69757986, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71638626, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.6041259765625 + }, + { + "auxiliary_loss_clip": 0.01131392, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.0004493, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 2.0558430509591457, + "language_loss": 0.65945852, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.6818012, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.5960891246795654 + }, + { + "auxiliary_loss_clip": 0.01120042, + "auxiliary_loss_mlp": 0.01102495, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00045145, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 2.361928112222001, + "language_loss": 0.8425625, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86478788, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.66251277923584 + }, + { + "auxiliary_loss_clip": 0.01131589, + "auxiliary_loss_mlp": 0.00747382, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00033712, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.8631009076094767, + "language_loss": 0.79964149, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.81843114, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.6278815269470215 + }, + { + "auxiliary_loss_clip": 0.01129011, + "auxiliary_loss_mlp": 0.01102788, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00036359, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 2.1676823621742964, + "language_loss": 0.65423572, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67655373, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.596284866333008 + }, + { + "auxiliary_loss_clip": 0.01134933, + "auxiliary_loss_mlp": 0.01103027, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00060201, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.6209004217829972, + "language_loss": 0.7036615, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72604108, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.574205160140991 + }, + { + "auxiliary_loss_clip": 0.01133267, + "auxiliary_loss_mlp": 0.01103303, + "balance_loss_clip": 1.00204194, + "balance_loss_mlp": 1.00040197, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.8064548012104766, + "language_loss": 0.66338754, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68575329, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 2.5533552169799805 + }, + { + "auxiliary_loss_clip": 0.01148431, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00034547, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.58338336283101, + "language_loss": 0.58126777, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60378647, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 2.6688270568847656 + }, + { + "auxiliary_loss_clip": 0.01130523, + "auxiliary_loss_mlp": 0.01103108, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00049257, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.6977372260629737, + "language_loss": 0.74198246, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76431876, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 4.003427982330322 + }, + { + "auxiliary_loss_clip": 0.01135574, + "auxiliary_loss_mlp": 0.0110336, + "balance_loss_clip": 1.00172412, + "balance_loss_mlp": 1.00055385, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 2.280863737289244, + "language_loss": 0.69483531, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.7172246, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.580655813217163 + }, + { + "auxiliary_loss_clip": 0.01148601, + "auxiliary_loss_mlp": 0.01103625, + "balance_loss_clip": 1.00166357, + "balance_loss_mlp": 1.00034237, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.033074810459203, + "language_loss": 0.84629583, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.86881804, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 2.496828317642212 + }, + { + "auxiliary_loss_clip": 0.0110081, + "auxiliary_loss_mlp": 0.01103459, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 1.00055742, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.6221381882044574, + "language_loss": 0.78813541, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81017804, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.687263250350952 + }, + { + "auxiliary_loss_clip": 0.01148055, + "auxiliary_loss_mlp": 0.01101885, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00041449, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.6391993816609542, + "language_loss": 0.73275137, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75525075, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 2.4980976581573486 + }, + { + "auxiliary_loss_clip": 0.0116511, + "auxiliary_loss_mlp": 0.00747485, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00044703, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 1.9362094056825532, + "language_loss": 0.79460865, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81373459, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.4713876247406006 + }, + { + "auxiliary_loss_clip": 0.01150083, + "auxiliary_loss_mlp": 0.01103241, + "balance_loss_clip": 1.00181592, + "balance_loss_mlp": 1.00043464, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 2.560800077457053, + "language_loss": 0.64229238, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66482562, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.5370829105377197 + }, + { + "auxiliary_loss_clip": 0.0112952, + "auxiliary_loss_mlp": 0.01103928, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.00045419, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.0764034456925926, + "language_loss": 0.77131605, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79365063, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.5212669372558594 + }, + { + "auxiliary_loss_clip": 0.0116505, + "auxiliary_loss_mlp": 0.01102781, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.0004518, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 1.8855930656138409, + "language_loss": 0.74582511, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.76850343, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 2.4827189445495605 + }, + { + "auxiliary_loss_clip": 0.01133632, + "auxiliary_loss_mlp": 0.01102637, + "balance_loss_clip": 1.00172353, + "balance_loss_mlp": 1.00040317, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 2.3217431274484914, + "language_loss": 0.65454817, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67691082, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.5958595275878906 + }, + { + "auxiliary_loss_clip": 0.0111619, + "auxiliary_loss_mlp": 0.01103679, + "balance_loss_clip": 1.00171924, + "balance_loss_mlp": 1.00039613, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.3170685936894004, + "language_loss": 0.62073052, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.6429292, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.6443588733673096 + }, + { + "auxiliary_loss_clip": 0.01133478, + "auxiliary_loss_mlp": 0.01102719, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.5361854545357927, + "language_loss": 0.73085892, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.75322092, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 2.6035196781158447 + }, + { + "auxiliary_loss_clip": 0.0115033, + "auxiliary_loss_mlp": 0.01103408, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00050628, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.7282527390273283, + "language_loss": 0.72236657, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74490392, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.512805700302124 + }, + { + "auxiliary_loss_clip": 0.01145837, + "auxiliary_loss_mlp": 0.01102495, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00045156, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 2.623326211750892, + "language_loss": 0.74482477, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76730812, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 2.5053551197052 + }, + { + "auxiliary_loss_clip": 0.01165148, + "auxiliary_loss_mlp": 0.01103447, + "balance_loss_clip": 1.00186813, + "balance_loss_mlp": 1.00045061, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.837747268205422, + "language_loss": 0.7123751, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73506105, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 3.890294313430786 + }, + { + "auxiliary_loss_clip": 0.01099858, + "auxiliary_loss_mlp": 0.01102601, + "balance_loss_clip": 1.00145316, + "balance_loss_mlp": 1.00055814, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.6971935761281791, + "language_loss": 0.7128979, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73492253, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 4.119926929473877 + }, + { + "auxiliary_loss_clip": 0.0109973, + "auxiliary_loss_mlp": 0.01102329, + "balance_loss_clip": 1.00158548, + "balance_loss_mlp": 1.00047684, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.8271191471478194, + "language_loss": 0.81409085, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83611143, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 2.652940273284912 + }, + { + "auxiliary_loss_clip": 0.0113311, + "auxiliary_loss_mlp": 0.01103101, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00048554, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.620383481917225, + "language_loss": 0.68624514, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70860726, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 4.092561483383179 + }, + { + "auxiliary_loss_clip": 0.01133964, + "auxiliary_loss_mlp": 0.01104733, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00059152, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.3732726664918604, + "language_loss": 0.83896649, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86135346, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 2.611607789993286 + }, + { + "auxiliary_loss_clip": 0.01133287, + "auxiliary_loss_mlp": 0.01103841, + "balance_loss_clip": 1.00181711, + "balance_loss_mlp": 1.00046253, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 2.8952591592209065, + "language_loss": 0.77010119, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79247242, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 2.6034717559814453 + }, + { + "auxiliary_loss_clip": 0.01131883, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_clip": 1.00176513, + "balance_loss_mlp": 1.0005734, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 1.8914277880178703, + "language_loss": 0.60395098, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62630171, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 2.552596092224121 + }, + { + "auxiliary_loss_clip": 0.01134662, + "auxiliary_loss_mlp": 0.01102606, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.0004673, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.6541173614460039, + "language_loss": 0.77522761, + "learning_rate": 1.455139770123972e-07, + "loss": 0.79760027, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 2.5987966060638428 + }, + { + "auxiliary_loss_clip": 0.01112771, + "auxiliary_loss_mlp": 0.01103, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00076663, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.5718037001804046, + "language_loss": 0.76841998, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79057771, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.6207778453826904 + }, + { + "auxiliary_loss_clip": 0.01104111, + "auxiliary_loss_mlp": 0.01101878, + "balance_loss_clip": 1.00149488, + "balance_loss_mlp": 1.00040674, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.7374401345076602, + "language_loss": 0.74066603, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76272595, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.618666648864746 + }, + { + "auxiliary_loss_clip": 0.01149353, + "auxiliary_loss_mlp": 0.00747301, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.00041795, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.5294486249326962, + "language_loss": 0.69929993, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71826649, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.6290903091430664 + }, + { + "auxiliary_loss_clip": 0.01067948, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00136495, + "balance_loss_mlp": 1.00052643, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.5404207393748492, + "language_loss": 0.8089987, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83070004, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 2.71579647064209 + }, + { + "auxiliary_loss_clip": 0.01133686, + "auxiliary_loss_mlp": 0.011032, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.000489, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 1.875157930017528, + "language_loss": 0.57734847, + "learning_rate": 1.447856667743117e-07, + "loss": 0.59971738, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 2.5522265434265137 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01103652, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00046468, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 2.1360633499611215, + "language_loss": 0.83620185, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85872221, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.5189998149871826 + }, + { + "auxiliary_loss_clip": 0.01165023, + "auxiliary_loss_mlp": 0.0110317, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.00065041, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.6269252064864257, + "language_loss": 0.62173307, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64441502, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 2.479807138442993 + }, + { + "auxiliary_loss_clip": 0.01148275, + "auxiliary_loss_mlp": 0.01101805, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00042915, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.0037928309963564, + "language_loss": 0.57303071, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59553152, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 2.4800572395324707 + }, + { + "auxiliary_loss_clip": 0.01164994, + "auxiliary_loss_mlp": 0.01103101, + "balance_loss_clip": 1.00185251, + "balance_loss_mlp": 1.00048578, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 1.871232379797493, + "language_loss": 0.71223086, + "learning_rate": 1.442042848491043e-07, + "loss": 0.7349118, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 2.461291551589966 + }, + { + "auxiliary_loss_clip": 0.01150436, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00066638, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 8.021299883544751, + "language_loss": 0.73192215, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75444984, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.579411745071411 + }, + { + "auxiliary_loss_clip": 0.01131556, + "auxiliary_loss_mlp": 0.01103301, + "balance_loss_clip": 1.00157833, + "balance_loss_mlp": 1.00039923, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 3.7136053192698517, + "language_loss": 0.85133433, + "learning_rate": 1.43914016096218e-07, + "loss": 0.87368286, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.536083936691284 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01102682, + "balance_loss_clip": 1.00166428, + "balance_loss_mlp": 1.00044823, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5612371015855355, + "language_loss": 0.72387123, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74606812, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.632929563522339 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.01079797, + "balance_loss_clip": 1.00111532, + "balance_loss_mlp": 1.00007021, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.7929672040501956, + "language_loss": 0.49316728, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51525718, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.2556657791137695 + }, + { + "auxiliary_loss_clip": 0.01133592, + "auxiliary_loss_mlp": 0.00747364, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00035882, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 1.9197354626552563, + "language_loss": 0.76524949, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78405905, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.5761663913726807 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01102319, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00037086, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 2.0370610163868177, + "language_loss": 0.79368377, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81604075, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.574434757232666 + }, + { + "auxiliary_loss_clip": 0.01112873, + "auxiliary_loss_mlp": 0.01080089, + "balance_loss_clip": 1.00166011, + "balance_loss_mlp": 0.99998015, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.688082350517052, + "language_loss": 0.54793429, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56986392, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 3.2827343940734863 + }, + { + "auxiliary_loss_clip": 0.01164943, + "auxiliary_loss_mlp": 0.01102837, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00041199, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 3.029689442575789, + "language_loss": 0.65302086, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67569864, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 3.86908221244812 + }, + { + "auxiliary_loss_clip": 0.01133642, + "auxiliary_loss_mlp": 0.01103491, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00049424, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 2.9830318384960233, + "language_loss": 0.71081781, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73318923, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 2.617744207382202 + }, + { + "auxiliary_loss_clip": 0.01131501, + "auxiliary_loss_mlp": 0.01102252, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.00039995, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 2.181877139606294, + "language_loss": 0.63617647, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.65851402, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.589177131652832 + }, + { + "auxiliary_loss_clip": 0.01164946, + "auxiliary_loss_mlp": 0.01103142, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00043178, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.160619690494963, + "language_loss": 0.7684207, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79110157, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 2.457868814468384 + }, + { + "auxiliary_loss_clip": 0.0113363, + "auxiliary_loss_mlp": 0.01103647, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00045919, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.910981522916587, + "language_loss": 0.730533, + "learning_rate": 1.424668961888047e-07, + "loss": 0.75290573, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 2.570878028869629 + }, + { + "auxiliary_loss_clip": 0.01100956, + "auxiliary_loss_mlp": 0.0110391, + "balance_loss_clip": 1.00171542, + "balance_loss_mlp": 1.00043619, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.9355413965179642, + "language_loss": 0.74601376, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76806241, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.6357991695404053 + }, + { + "auxiliary_loss_clip": 0.01116947, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_clip": 1.00173557, + "balance_loss_mlp": 1.00039494, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.7116659259686358, + "language_loss": 0.65692878, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67913216, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.6351327896118164 + }, + { + "auxiliary_loss_clip": 0.01148189, + "auxiliary_loss_mlp": 0.01102331, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00047851, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.6186378177833884, + "language_loss": 0.69536638, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71787155, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.5073280334472656 + }, + { + "auxiliary_loss_clip": 0.0108745, + "auxiliary_loss_mlp": 0.01103666, + "balance_loss_clip": 1.00158477, + "balance_loss_mlp": 1.00038314, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.7977669106452245, + "language_loss": 0.74242139, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76433259, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.6542606353759766 + }, + { + "auxiliary_loss_clip": 0.0109968, + "auxiliary_loss_mlp": 0.01102095, + "balance_loss_clip": 1.00161743, + "balance_loss_mlp": 1.00033784, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 6.30715600002669, + "language_loss": 0.63028944, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65230715, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 2.6031668186187744 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01103002, + "balance_loss_clip": 1.00181282, + "balance_loss_mlp": 1.0004822, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 2.4010423559835896, + "language_loss": 0.68882573, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71133947, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 2.589507579803467 + }, + { + "auxiliary_loss_clip": 0.01147845, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00048709, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.8205897854508502, + "language_loss": 0.67174119, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69425064, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01103119, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00050414, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.3413656797569966, + "language_loss": 0.74577665, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76810133, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 2.63236665725708 + }, + { + "auxiliary_loss_clip": 0.01135321, + "auxiliary_loss_mlp": 0.01104323, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00056291, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4879251398299642, + "language_loss": 0.72890025, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.75129664, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 4.108774662017822 + }, + { + "auxiliary_loss_clip": 0.01115769, + "auxiliary_loss_mlp": 0.01104723, + "balance_loss_clip": 1.00168753, + "balance_loss_mlp": 1.00039077, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 2.135743530709944, + "language_loss": 0.51625043, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.53845537, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.5816543102264404 + }, + { + "auxiliary_loss_clip": 0.01115066, + "auxiliary_loss_mlp": 0.01103177, + "balance_loss_clip": 1.0016942, + "balance_loss_mlp": 1.00046611, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.444787885267211, + "language_loss": 0.61023974, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.63242221, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 3.9956111907958984 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.01101914, + "balance_loss_clip": 1.00190604, + "balance_loss_mlp": 1.00044286, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.561777382407413, + "language_loss": 0.75371087, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77637887, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.489323377609253 + }, + { + "auxiliary_loss_clip": 0.0114592, + "auxiliary_loss_mlp": 0.01102765, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00053084, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.8368526747045728, + "language_loss": 0.72718203, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74966884, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 4.0467588901519775 + }, + { + "auxiliary_loss_clip": 0.01148308, + "auxiliary_loss_mlp": 0.01102871, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00044656, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.6067728740967777, + "language_loss": 0.8012988, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82381058, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.5462822914123535 + }, + { + "auxiliary_loss_clip": 0.01100851, + "auxiliary_loss_mlp": 0.01102629, + "balance_loss_clip": 1.00163555, + "balance_loss_mlp": 1.00049067, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.8392870479951924, + "language_loss": 0.74833918, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.77037394, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 2.701573371887207 + }, + { + "auxiliary_loss_clip": 0.01133461, + "auxiliary_loss_mlp": 0.01103194, + "balance_loss_clip": 1.00177205, + "balance_loss_mlp": 1.00057828, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 1.7737532716464641, + "language_loss": 0.71700937, + "learning_rate": 1.401661576761779e-07, + "loss": 0.73937595, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 2.5777015686035156 + }, + { + "auxiliary_loss_clip": 0.01143745, + "auxiliary_loss_mlp": 0.01079763, + "balance_loss_clip": 1.00117517, + "balance_loss_mlp": 1.00003576, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.80788380254276, + "language_loss": 0.53669029, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55892539, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.159848928451538 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_clip": 1.00181723, + "balance_loss_mlp": 1.00035441, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.587220871505926, + "language_loss": 0.76768053, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.79004598, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.662130355834961 + }, + { + "auxiliary_loss_clip": 0.01116865, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_clip": 1.00176656, + "balance_loss_mlp": 1.00035489, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.8480670369654824, + "language_loss": 0.72993875, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75213331, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.6341187953948975 + }, + { + "auxiliary_loss_clip": 0.0113495, + "auxiliary_loss_mlp": 0.01104213, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00045347, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 1.8302869859565876, + "language_loss": 0.71597278, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73836446, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.650937795639038 + }, + { + "auxiliary_loss_clip": 0.01118941, + "auxiliary_loss_mlp": 0.01104273, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00051367, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.5765665935806341, + "language_loss": 0.71983302, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.74206519, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 2.824024200439453 + }, + { + "auxiliary_loss_clip": 0.01099111, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_clip": 1.0016228, + "balance_loss_mlp": 1.00048256, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.8832343940446379, + "language_loss": 0.66457289, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68657976, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.6310150623321533 + }, + { + "auxiliary_loss_clip": 0.01145918, + "auxiliary_loss_mlp": 0.01102322, + "balance_loss_clip": 1.00193012, + "balance_loss_mlp": 1.00037444, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.512279808357781, + "language_loss": 0.70385826, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72634065, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 2.559774875640869 + }, + { + "auxiliary_loss_clip": 0.01133597, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00043213, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.4478224003035036, + "language_loss": 0.7102865, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73264056, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.6526644229888916 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01102869, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00044417, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.5556175532709606, + "language_loss": 0.74498492, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76750696, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.5346226692199707 + }, + { + "auxiliary_loss_clip": 0.01113892, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_clip": 1.00101697, + "balance_loss_mlp": 1.00003672, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7931291143211583, + "language_loss": 0.60448492, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62641764, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 3.03825306892395 + }, + { + "auxiliary_loss_clip": 0.01133402, + "auxiliary_loss_mlp": 0.01101067, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00054979, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 2.1612876694274785, + "language_loss": 0.67163789, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.6939826, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.758527994155884 + }, + { + "auxiliary_loss_clip": 0.01132006, + "auxiliary_loss_mlp": 0.01104249, + "balance_loss_clip": 1.00170481, + "balance_loss_mlp": 1.00048971, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.4477314636309881, + "language_loss": 0.62694907, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.6493116, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.8327693939208984 + }, + { + "auxiliary_loss_clip": 0.01116379, + "auxiliary_loss_mlp": 0.01101769, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00029826, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.2705839958569594, + "language_loss": 0.63949394, + "learning_rate": 1.38310100580431e-07, + "loss": 0.66167533, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.5854997634887695 + }, + { + "auxiliary_loss_clip": 0.01116757, + "auxiliary_loss_mlp": 0.01104311, + "balance_loss_clip": 1.00173712, + "balance_loss_mlp": 1.00036073, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.0316837695502548, + "language_loss": 0.75998485, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78219557, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.6394102573394775 + }, + { + "auxiliary_loss_clip": 0.01085744, + "auxiliary_loss_mlp": 0.0110203, + "balance_loss_clip": 1.00158048, + "balance_loss_mlp": 1.00055957, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.0030498293863355, + "language_loss": 0.80719, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.82906777, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.655959129333496 + }, + { + "auxiliary_loss_clip": 0.01131267, + "auxiliary_loss_mlp": 0.01102758, + "balance_loss_clip": 1.00158036, + "balance_loss_mlp": 1.00033355, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.3979582755490851, + "language_loss": 0.55609381, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57843405, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.6247963905334473 + }, + { + "auxiliary_loss_clip": 0.01084643, + "auxiliary_loss_mlp": 0.01103092, + "balance_loss_clip": 1.00151277, + "balance_loss_mlp": 1.00047648, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.7897192541687046, + "language_loss": 0.73538971, + "learning_rate": 1.377414057838755e-07, + "loss": 0.75726706, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.738492012023926 + }, + { + "auxiliary_loss_clip": 0.01150038, + "auxiliary_loss_mlp": 0.01103148, + "balance_loss_clip": 1.00195563, + "balance_loss_mlp": 1.0004369, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.572826259121003, + "language_loss": 0.7552166, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77774841, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 3.998429298400879 + }, + { + "auxiliary_loss_clip": 0.01118752, + "auxiliary_loss_mlp": 0.01103212, + "balance_loss_clip": 1.00171828, + "balance_loss_mlp": 1.000597, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 2.2590856119284592, + "language_loss": 0.71091747, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73313713, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 2.605557441711426 + }, + { + "auxiliary_loss_clip": 0.01149749, + "auxiliary_loss_mlp": 0.0110239, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00044191, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 18.277215734641604, + "language_loss": 0.74023342, + "learning_rate": 1.373156261464208e-07, + "loss": 0.7627548, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.615723133087158 + }, + { + "auxiliary_loss_clip": 0.01084207, + "auxiliary_loss_mlp": 0.01101962, + "balance_loss_clip": 1.0016377, + "balance_loss_mlp": 1.00039601, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 2.1430326548505136, + "language_loss": 0.78786105, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80972272, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 2.7366552352905273 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01102859, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00043452, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.6489985356259564, + "language_loss": 0.71892798, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74160683, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 2.5297491550445557 + }, + { + "auxiliary_loss_clip": 0.01131229, + "auxiliary_loss_mlp": 0.01103485, + "balance_loss_clip": 1.00164413, + "balance_loss_mlp": 1.00039303, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 1.795854789479132, + "language_loss": 0.82550824, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84785539, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.605280876159668 + }, + { + "auxiliary_loss_clip": 0.01133295, + "auxiliary_loss_mlp": 0.01103132, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00042152, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.4796576231241256, + "language_loss": 0.62446129, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64682555, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.8186490535736084 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.01103288, + "balance_loss_clip": 1.00180733, + "balance_loss_mlp": 1.00048232, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.8347833004456546, + "language_loss": 0.68779564, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.71031249, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.655576705932617 + }, + { + "auxiliary_loss_clip": 0.01118826, + "auxiliary_loss_mlp": 0.0110272, + "balance_loss_clip": 1.00167346, + "balance_loss_mlp": 1.00048578, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.8451810233565857, + "language_loss": 0.78040242, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80261791, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.621001720428467 + }, + { + "auxiliary_loss_clip": 0.01143816, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_clip": 1.00119233, + "balance_loss_mlp": 1.00001609, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.8037589461341199, + "language_loss": 0.58929837, + "learning_rate": 1.363246127376143e-07, + "loss": 0.611534, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 2.999154567718506 + }, + { + "auxiliary_loss_clip": 0.01133705, + "auxiliary_loss_mlp": 0.00747455, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00037754, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 2.681345489658936, + "language_loss": 0.6894685, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.70828009, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.5586366653442383 + }, + { + "auxiliary_loss_clip": 0.01150267, + "auxiliary_loss_mlp": 0.00747306, + "balance_loss_clip": 1.00189745, + "balance_loss_mlp": 1.00034332, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.218829967052742, + "language_loss": 0.69658792, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71556365, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.7226316928863525 + }, + { + "auxiliary_loss_clip": 0.01131247, + "auxiliary_loss_mlp": 0.01103324, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00042212, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.5665171647336937, + "language_loss": 0.70074952, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72309518, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 4.075528144836426 + }, + { + "auxiliary_loss_clip": 0.01116944, + "auxiliary_loss_mlp": 0.0110256, + "balance_loss_clip": 1.00177789, + "balance_loss_mlp": 1.00042129, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 1.9539552227000543, + "language_loss": 0.66772664, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68992174, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 2.5838160514831543 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_clip": 1.00198054, + "balance_loss_mlp": 1.00052905, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.7696529420057607, + "language_loss": 0.62728286, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.6495952, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 4.20318865776062 + }, + { + "auxiliary_loss_clip": 0.01115816, + "auxiliary_loss_mlp": 0.01102627, + "balance_loss_clip": 1.00164831, + "balance_loss_mlp": 1.00039256, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.4918335979983317, + "language_loss": 0.79277003, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.8149544, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.640468120574951 + }, + { + "auxiliary_loss_clip": 0.01117017, + "auxiliary_loss_mlp": 0.01102697, + "balance_loss_clip": 1.00164437, + "balance_loss_mlp": 1.00055814, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.6221407135834671, + "language_loss": 0.82774222, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.84993941, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 2.6200594902038574 + }, + { + "auxiliary_loss_clip": 0.01127328, + "auxiliary_loss_mlp": 0.01079469, + "balance_loss_clip": 1.00115836, + "balance_loss_mlp": 1.0001235, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.8968573513376898, + "language_loss": 0.59972298, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.62179101, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 4.500377655029297 + }, + { + "auxiliary_loss_clip": 0.01165173, + "auxiliary_loss_mlp": 0.00747325, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00033212, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.8758768765226173, + "language_loss": 0.66840774, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68753266, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 2.4895668029785156 + }, + { + "auxiliary_loss_clip": 0.01133633, + "auxiliary_loss_mlp": 0.01102498, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.0004071, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 1.8217502142464554, + "language_loss": 0.75324297, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77560431, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 2.5567305088043213 + }, + { + "auxiliary_loss_clip": 0.01119046, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_clip": 1.00177324, + "balance_loss_mlp": 1.00049019, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.7282541992605058, + "language_loss": 0.70371789, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72593081, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 2.612861156463623 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01103926, + "balance_loss_clip": 1.00196528, + "balance_loss_mlp": 1.00045276, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.7002461501524935, + "language_loss": 0.84621084, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86854529, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 2.5568811893463135 + }, + { + "auxiliary_loss_clip": 0.01118977, + "auxiliary_loss_mlp": 0.01103818, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00043988, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.100852097176699, + "language_loss": 0.67855299, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70078099, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.734837770462036 + }, + { + "auxiliary_loss_clip": 0.01148358, + "auxiliary_loss_mlp": 0.01104436, + "balance_loss_clip": 1.0017699, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.5495444617598653, + "language_loss": 0.74942708, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77195501, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.5356333255767822 + }, + { + "auxiliary_loss_clip": 0.01148286, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00048757, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.9935493950797485, + "language_loss": 0.87063086, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89314282, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.5016016960144043 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00047946, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.7936293904043323, + "language_loss": 0.63609564, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65812087, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.7014708518981934 + }, + { + "auxiliary_loss_clip": 0.01164977, + "auxiliary_loss_mlp": 0.01103586, + "balance_loss_clip": 1.00188577, + "balance_loss_mlp": 1.00039887, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 1.7813971760023728, + "language_loss": 0.72886699, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75155264, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.483614206314087 + }, + { + "auxiliary_loss_clip": 0.0115019, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00179362, + "balance_loss_mlp": 1.00040019, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.7834732955128578, + "language_loss": 0.59364039, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61261588, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.5759432315826416 + }, + { + "auxiliary_loss_clip": 0.01116881, + "auxiliary_loss_mlp": 0.01104115, + "balance_loss_clip": 1.00176132, + "balance_loss_mlp": 1.00045037, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.634377344201445, + "language_loss": 0.60102558, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62323558, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 2.650947093963623 + }, + { + "auxiliary_loss_clip": 0.01145659, + "auxiliary_loss_mlp": 0.00747442, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00033927, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.5445512017213388, + "language_loss": 0.76621807, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78514904, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 2.563049793243408 + }, + { + "auxiliary_loss_clip": 0.01165107, + "auxiliary_loss_mlp": 0.0074736, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00039172, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.6107180613459975, + "language_loss": 0.77348161, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79260635, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.482114791870117 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.0110366, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.0004729, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 2.2029551689366746, + "language_loss": 0.76646566, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.7888211, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.6358659267425537 + }, + { + "auxiliary_loss_clip": 0.01131389, + "auxiliary_loss_mlp": 0.00747282, + "balance_loss_clip": 1.00162196, + "balance_loss_mlp": 1.0003624, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.6983536177908842, + "language_loss": 0.82745624, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84624296, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 2.593648910522461 + }, + { + "auxiliary_loss_clip": 0.01148291, + "auxiliary_loss_mlp": 0.01103026, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00050569, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 1.790112339106357, + "language_loss": 0.77503604, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79754925, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.771782398223877 + }, + { + "auxiliary_loss_clip": 0.01079913, + "auxiliary_loss_mlp": 0.00747482, + "balance_loss_clip": 1.001701, + "balance_loss_mlp": 1.00042367, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 11.552783573913503, + "language_loss": 0.69629943, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71457338, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.6869490146636963 + }, + { + "auxiliary_loss_clip": 0.01148161, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00044644, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 1.72319537051645, + "language_loss": 0.59321046, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61571699, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 2.543158769607544 + }, + { + "auxiliary_loss_clip": 0.01165026, + "auxiliary_loss_mlp": 0.0110379, + "balance_loss_clip": 1.00197279, + "balance_loss_mlp": 1.00041199, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.2789548579025953, + "language_loss": 0.81671536, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83940351, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.4734559059143066 + }, + { + "auxiliary_loss_clip": 0.01131686, + "auxiliary_loss_mlp": 0.01103691, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00050354, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 3.9579183520337065, + "language_loss": 0.80277199, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82512575, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 4.095592021942139 + }, + { + "auxiliary_loss_clip": 0.01164915, + "auxiliary_loss_mlp": 0.01103287, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00057614, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.5531471480398298, + "language_loss": 0.64949965, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67218167, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 2.475503921508789 + }, + { + "auxiliary_loss_clip": 0.01165047, + "auxiliary_loss_mlp": 0.01103334, + "balance_loss_clip": 1.00192916, + "balance_loss_mlp": 1.00043225, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 1.7879740835197075, + "language_loss": 0.74453706, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76722085, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 2.520303249359131 + }, + { + "auxiliary_loss_clip": 0.01135681, + "auxiliary_loss_mlp": 0.01103875, + "balance_loss_clip": 1.00179362, + "balance_loss_mlp": 1.00059199, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.4711880763597767, + "language_loss": 0.78000593, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.80240148, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 2.5853679180145264 + }, + { + "auxiliary_loss_clip": 0.01132864, + "auxiliary_loss_mlp": 0.01103226, + "balance_loss_clip": 1.00179493, + "balance_loss_mlp": 1.00041974, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 1.8172354142544853, + "language_loss": 0.76533067, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78769159, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.5514864921569824 + }, + { + "auxiliary_loss_clip": 0.01088907, + "auxiliary_loss_mlp": 0.01102705, + "balance_loss_clip": 1.0016942, + "balance_loss_mlp": 1.00056672, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.973267306401379, + "language_loss": 0.6788944, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70081049, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 2.752734422683716 + }, + { + "auxiliary_loss_clip": 0.01165116, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00044787, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.7729243008452136, + "language_loss": 0.68974417, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71243548, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.7402729988098145 + }, + { + "auxiliary_loss_clip": 0.01164843, + "auxiliary_loss_mlp": 0.01102586, + "balance_loss_clip": 1.00178158, + "balance_loss_mlp": 1.00044727, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 3.0036971300363122, + "language_loss": 0.74715567, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76982999, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 2.4745795726776123 + }, + { + "auxiliary_loss_clip": 0.0113207, + "auxiliary_loss_mlp": 0.0110394, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.00056136, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.3874254321481096, + "language_loss": 0.75912154, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78148174, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.5511436462402344 + }, + { + "auxiliary_loss_clip": 0.01164974, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00064015, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.588310300523158, + "language_loss": 0.61343861, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.6361171, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.567321538925171 + }, + { + "auxiliary_loss_clip": 0.01150307, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00044084, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.5784633273461919, + "language_loss": 0.64254659, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66508412, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 2.528963327407837 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.0074737, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00044489, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.4591784759372577, + "language_loss": 0.71330559, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73209763, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 2.5370023250579834 + }, + { + "auxiliary_loss_clip": 0.01165105, + "auxiliary_loss_mlp": 0.01103486, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00039399, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 1.9513629202595664, + "language_loss": 0.66110647, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68379235, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 2.562548875808716 + }, + { + "auxiliary_loss_clip": 0.0111626, + "auxiliary_loss_mlp": 0.01102284, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00062227, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.5583966953138342, + "language_loss": 0.76236141, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78454685, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 4.162464380264282 + }, + { + "auxiliary_loss_clip": 0.01133632, + "auxiliary_loss_mlp": 0.01102658, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00042415, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 1.7962258917883827, + "language_loss": 0.73431498, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75667787, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01164904, + "auxiliary_loss_mlp": 0.01102266, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00041318, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.7006013530733732, + "language_loss": 0.70931518, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73198688, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 3.9421589374542236 + }, + { + "auxiliary_loss_clip": 0.01150189, + "auxiliary_loss_mlp": 0.01102533, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00048995, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.6147002527518652, + "language_loss": 0.70230001, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72482729, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 2.549805164337158 + }, + { + "auxiliary_loss_clip": 0.01133185, + "auxiliary_loss_mlp": 0.0110238, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00033712, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.0093949645348075, + "language_loss": 0.67087603, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.6932317, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.552377939224243 + }, + { + "auxiliary_loss_clip": 0.01148094, + "auxiliary_loss_mlp": 0.01102336, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00057864, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 2.0621540651740093, + "language_loss": 0.65607125, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67857552, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 3.981323003768921 + }, + { + "auxiliary_loss_clip": 0.01133853, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00050998, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.5814291878698077, + "language_loss": 0.82206047, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84442449, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 2.6324548721313477 + }, + { + "auxiliary_loss_clip": 0.01135335, + "auxiliary_loss_mlp": 0.01101114, + "balance_loss_clip": 1.00170529, + "balance_loss_mlp": 1.00040627, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 7.566679428215902, + "language_loss": 0.76591349, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78827798, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 2.6020543575286865 + }, + { + "auxiliary_loss_clip": 0.01133103, + "auxiliary_loss_mlp": 0.01102438, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00039542, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.7442631171385783, + "language_loss": 0.75355721, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77591258, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.613093614578247 + }, + { + "auxiliary_loss_clip": 0.01100481, + "auxiliary_loss_mlp": 0.00747239, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00036097, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 3.076645154598173, + "language_loss": 0.72021294, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.73869014, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.6689751148223877 + }, + { + "auxiliary_loss_clip": 0.01165002, + "auxiliary_loss_mlp": 0.01102611, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00047231, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.6154436298491262, + "language_loss": 0.80047506, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82315123, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 2.481794834136963 + }, + { + "auxiliary_loss_clip": 0.01150358, + "auxiliary_loss_mlp": 0.01103415, + "balance_loss_clip": 1.00172818, + "balance_loss_mlp": 1.00032234, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 1.852642864802066, + "language_loss": 0.6916945, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71423221, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.5711514949798584 + }, + { + "auxiliary_loss_clip": 0.01150163, + "auxiliary_loss_mlp": 0.01102867, + "balance_loss_clip": 1.00176942, + "balance_loss_mlp": 1.00053811, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 2.8752437602703385, + "language_loss": 0.7039932, + "learning_rate": 1.2893372177522e-07, + "loss": 0.7265234, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.511943817138672 + }, + { + "auxiliary_loss_clip": 0.01165021, + "auxiliary_loss_mlp": 0.01102559, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00032544, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.5092865922087675, + "language_loss": 0.77537674, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79805255, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.489154577255249 + }, + { + "auxiliary_loss_clip": 0.01127648, + "auxiliary_loss_mlp": 0.0107934, + "balance_loss_clip": 1.00136781, + "balance_loss_mlp": 0.99999392, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8864612721641184, + "language_loss": 0.56736803, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58943796, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 2.934277296066284 + }, + { + "auxiliary_loss_clip": 0.01160426, + "auxiliary_loss_mlp": 0.01079368, + "balance_loss_clip": 1.00118852, + "balance_loss_mlp": 1.00002277, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7876618430053983, + "language_loss": 0.62449479, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64689273, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.1398746967315674 + }, + { + "auxiliary_loss_clip": 0.01044803, + "auxiliary_loss_mlp": 0.01079734, + "balance_loss_clip": 1.00079238, + "balance_loss_mlp": 1.00000727, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7866037821658057, + "language_loss": 0.58190429, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60314965, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 3.2156717777252197 + }, + { + "auxiliary_loss_clip": 0.01164853, + "auxiliary_loss_mlp": 0.0110203, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00055921, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.611227806439497, + "language_loss": 0.65956783, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68223667, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 3.137845993041992 + }, + { + "auxiliary_loss_clip": 0.01165136, + "auxiliary_loss_mlp": 0.0110394, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00046682, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5397394693399227, + "language_loss": 0.77652651, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79921728, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.513841152191162 + }, + { + "auxiliary_loss_clip": 0.01133937, + "auxiliary_loss_mlp": 0.01104516, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00056529, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 4.078540919783598, + "language_loss": 0.60322845, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62561297, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 2.652956247329712 + }, + { + "auxiliary_loss_clip": 0.01116987, + "auxiliary_loss_mlp": 0.01103236, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00042951, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.743919353724595, + "language_loss": 0.6500752, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67227745, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.594259023666382 + }, + { + "auxiliary_loss_clip": 0.01116325, + "auxiliary_loss_mlp": 0.00747528, + "balance_loss_clip": 1.00154483, + "balance_loss_mlp": 1.00043392, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.719234276094361, + "language_loss": 0.85293204, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87157059, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.5948052406311035 + }, + { + "auxiliary_loss_clip": 0.01131631, + "auxiliary_loss_mlp": 0.01102476, + "balance_loss_clip": 1.00184286, + "balance_loss_mlp": 1.00043297, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 2.0058659639357224, + "language_loss": 0.70280242, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.72514349, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.5888619422912598 + }, + { + "auxiliary_loss_clip": 0.01164819, + "auxiliary_loss_mlp": 0.01101628, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00044358, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.58671795265372, + "language_loss": 0.70449567, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72716022, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 2.588862895965576 + }, + { + "auxiliary_loss_clip": 0.01150138, + "auxiliary_loss_mlp": 0.01102933, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00031745, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.5848940420416522, + "language_loss": 0.70454085, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72707158, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.5617783069610596 + }, + { + "auxiliary_loss_clip": 0.01132714, + "auxiliary_loss_mlp": 0.01102723, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.000489, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.6686841438273115, + "language_loss": 0.72614527, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.74849963, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 4.324891567230225 + }, + { + "auxiliary_loss_clip": 0.01119835, + "auxiliary_loss_mlp": 0.0110299, + "balance_loss_clip": 1.00171208, + "balance_loss_mlp": 1.00056505, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.4588623760997421, + "language_loss": 0.74171412, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76394236, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 2.6710989475250244 + }, + { + "auxiliary_loss_clip": 0.01070463, + "auxiliary_loss_mlp": 0.01104501, + "balance_loss_clip": 1.00162506, + "balance_loss_mlp": 1.00045538, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 2.0058292531892885, + "language_loss": 0.66398013, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.6857298, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 2.7427875995635986 + }, + { + "auxiliary_loss_clip": 0.01116998, + "auxiliary_loss_mlp": 0.01102848, + "balance_loss_clip": 1.0015918, + "balance_loss_mlp": 1.0006144, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.5476480994474278, + "language_loss": 0.71860182, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.74080026, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 2.715688705444336 + }, + { + "auxiliary_loss_clip": 0.01148734, + "auxiliary_loss_mlp": 0.01104029, + "balance_loss_clip": 1.00173116, + "balance_loss_mlp": 1.00045991, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.5642557867565376, + "language_loss": 0.75519341, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77772105, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.525932550430298 + }, + { + "auxiliary_loss_clip": 0.01113926, + "auxiliary_loss_mlp": 0.01079409, + "balance_loss_clip": 1.00097871, + "balance_loss_mlp": 1.00006378, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7692165441290744, + "language_loss": 0.56114209, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.5830754, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 3.1046013832092285 + }, + { + "auxiliary_loss_clip": 0.01165132, + "auxiliary_loss_mlp": 0.01103569, + "balance_loss_clip": 1.00192654, + "balance_loss_mlp": 1.00047648, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 2.003979345381326, + "language_loss": 0.70508432, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72777128, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 2.5996475219726562 + }, + { + "auxiliary_loss_clip": 0.01131418, + "auxiliary_loss_mlp": 0.01079393, + "balance_loss_clip": 1.0011518, + "balance_loss_mlp": 1.00004768, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7505892153417933, + "language_loss": 0.58112121, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60322928, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 3.2325265407562256 + }, + { + "auxiliary_loss_clip": 0.01150422, + "auxiliary_loss_mlp": 0.01103288, + "balance_loss_clip": 1.00200307, + "balance_loss_mlp": 1.00038671, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.8464931626302168, + "language_loss": 0.79635519, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81889236, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.5509591102600098 + }, + { + "auxiliary_loss_clip": 0.01143942, + "auxiliary_loss_mlp": 0.01079381, + "balance_loss_clip": 1.00113964, + "balance_loss_mlp": 1.00003493, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8984366058086254, + "language_loss": 0.58097148, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60320473, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 3.01377272605896 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01102886, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00055695, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.6241770874690147, + "language_loss": 0.66478109, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68729693, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.5345914363861084 + }, + { + "auxiliary_loss_clip": 0.01116882, + "auxiliary_loss_mlp": 0.01105254, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00054073, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.2001342184653057, + "language_loss": 0.75878108, + "learning_rate": 1.256524149358682e-07, + "loss": 0.78100252, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.577399492263794 + }, + { + "auxiliary_loss_clip": 0.01148462, + "auxiliary_loss_mlp": 0.01102926, + "balance_loss_clip": 1.00171709, + "balance_loss_mlp": 1.00050175, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 2.376753968492671, + "language_loss": 0.73260677, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75512064, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 4.392985105514526 + }, + { + "auxiliary_loss_clip": 0.0113137, + "auxiliary_loss_mlp": 0.01102249, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00039649, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 2.9375945470649696, + "language_loss": 0.71938151, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.7417177, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.73738431930542 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01102935, + "balance_loss_clip": 1.00166941, + "balance_loss_mlp": 1.00041497, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.792327251395006, + "language_loss": 0.81532633, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83784139, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.595820426940918 + }, + { + "auxiliary_loss_clip": 0.01148305, + "auxiliary_loss_mlp": 0.01103271, + "balance_loss_clip": 1.0018779, + "balance_loss_mlp": 1.00036907, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.8937922361101514, + "language_loss": 0.67279482, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69531059, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 4.077308654785156 + }, + { + "auxiliary_loss_clip": 0.01133201, + "auxiliary_loss_mlp": 0.01102659, + "balance_loss_clip": 1.0017215, + "balance_loss_mlp": 1.00051999, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.8449723464945325, + "language_loss": 0.67034024, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69269884, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.5480260848999023 + }, + { + "auxiliary_loss_clip": 0.01131238, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_clip": 1.00161719, + "balance_loss_mlp": 1.00039363, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.862625637854431, + "language_loss": 0.7534138, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77574772, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 2.5601978302001953 + }, + { + "auxiliary_loss_clip": 0.01114565, + "auxiliary_loss_mlp": 0.01102293, + "balance_loss_clip": 1.00151873, + "balance_loss_mlp": 1.00044084, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 1.6891124287282508, + "language_loss": 0.81193507, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83410364, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 4.005499839782715 + }, + { + "auxiliary_loss_clip": 0.0115012, + "auxiliary_loss_mlp": 0.01102406, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.0003624, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.7809464873732712, + "language_loss": 0.68502843, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70755374, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 2.5766985416412354 + }, + { + "auxiliary_loss_clip": 0.01117381, + "auxiliary_loss_mlp": 0.01102818, + "balance_loss_clip": 1.00164151, + "balance_loss_mlp": 1.00029802, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 3.930923004306674, + "language_loss": 0.6992873, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72148931, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 2.603689432144165 + }, + { + "auxiliary_loss_clip": 0.01118834, + "auxiliary_loss_mlp": 0.00747298, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00033069, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 1.9951627300783537, + "language_loss": 0.6564126, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67507386, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 2.8870646953582764 + }, + { + "auxiliary_loss_clip": 0.01099445, + "auxiliary_loss_mlp": 0.01101788, + "balance_loss_clip": 1.00149536, + "balance_loss_mlp": 1.0003171, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.7232757768589984, + "language_loss": 0.6891458, + "learning_rate": 1.24162160341861e-07, + "loss": 0.71115816, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 2.618528366088867 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01105671, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00038564, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 1.736928332123166, + "language_loss": 0.75282258, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77523381, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.8312532901763916 + }, + { + "auxiliary_loss_clip": 0.01148265, + "auxiliary_loss_mlp": 0.01102801, + "balance_loss_clip": 1.00170469, + "balance_loss_mlp": 1.00037599, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 1.9094521751475253, + "language_loss": 0.74191117, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76442182, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 2.5460519790649414 + }, + { + "auxiliary_loss_clip": 0.01118905, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00043368, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.9278787300597726, + "language_loss": 0.75055373, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77276653, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 2.6229171752929688 + }, + { + "auxiliary_loss_clip": 0.01133589, + "auxiliary_loss_mlp": 0.01103197, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00039077, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.7476832602816865, + "language_loss": 0.77755892, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79992688, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.58309268951416 + }, + { + "auxiliary_loss_clip": 0.01114849, + "auxiliary_loss_mlp": 0.0107978, + "balance_loss_clip": 1.00115442, + "balance_loss_mlp": 1.00005293, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7481481451996471, + "language_loss": 0.56574237, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58768868, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.287936210632324 + }, + { + "auxiliary_loss_clip": 0.01100457, + "auxiliary_loss_mlp": 0.01103448, + "balance_loss_clip": 1.00165331, + "balance_loss_mlp": 1.00054646, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.723941239052946, + "language_loss": 0.64446449, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66650355, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.742811679840088 + }, + { + "auxiliary_loss_clip": 0.01148342, + "auxiliary_loss_mlp": 0.01102682, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00044847, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.6718177206350113, + "language_loss": 0.78223193, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80474216, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.57267689704895 + }, + { + "auxiliary_loss_clip": 0.01112348, + "auxiliary_loss_mlp": 0.00747341, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00049806, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.619368980347264, + "language_loss": 0.7638135, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78241038, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.635995626449585 + }, + { + "auxiliary_loss_clip": 0.01144372, + "auxiliary_loss_mlp": 0.00746314, + "balance_loss_clip": 1.00142097, + "balance_loss_mlp": 1.00086713, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7887474279034696, + "language_loss": 0.59310073, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61200762, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 3.007054567337036 + }, + { + "auxiliary_loss_clip": 0.01148749, + "auxiliary_loss_mlp": 0.01103124, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00050926, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 3.015127755615332, + "language_loss": 0.69271702, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71523577, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01150303, + "auxiliary_loss_mlp": 0.01102135, + "balance_loss_clip": 1.00178301, + "balance_loss_mlp": 1.00037813, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.8992278933372118, + "language_loss": 0.69251531, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71503967, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.4944417476654053 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.01103245, + "balance_loss_clip": 1.00166035, + "balance_loss_mlp": 1.00043941, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.7039391749304102, + "language_loss": 0.70447826, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72653329, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 2.6718521118164062 + }, + { + "auxiliary_loss_clip": 0.01133454, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00054836, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.7392682722827841, + "language_loss": 0.7077356, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73010278, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.553882598876953 + }, + { + "auxiliary_loss_clip": 0.01148382, + "auxiliary_loss_mlp": 0.01103151, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00034499, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 1.9361168432458584, + "language_loss": 0.75263375, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77514905, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.527428388595581 + }, + { + "auxiliary_loss_clip": 0.01149384, + "auxiliary_loss_mlp": 0.01102471, + "balance_loss_clip": 1.00177729, + "balance_loss_mlp": 1.00042772, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.7150281021402907, + "language_loss": 0.78163564, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80415416, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 2.5258684158325195 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01102496, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00045311, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.5348729456186185, + "language_loss": 0.75348204, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77563101, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 4.049916982650757 + }, + { + "auxiliary_loss_clip": 0.01164988, + "auxiliary_loss_mlp": 0.01103162, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00054693, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.4729777524158891, + "language_loss": 0.84102154, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86370301, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 2.5175962448120117 + }, + { + "auxiliary_loss_clip": 0.01148162, + "auxiliary_loss_mlp": 0.01101733, + "balance_loss_clip": 1.00175023, + "balance_loss_mlp": 1.00035775, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.320893862994197, + "language_loss": 0.74766117, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.77016008, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 2.619119644165039 + }, + { + "auxiliary_loss_clip": 0.01150301, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_clip": 1.00173187, + "balance_loss_mlp": 1.00042486, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 2.922559774113377, + "language_loss": 0.73209691, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75463223, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 2.522320508956909 + }, + { + "auxiliary_loss_clip": 0.01082599, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.0016458, + "balance_loss_mlp": 1.00039959, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.825321160376321, + "language_loss": 0.67105246, + "learning_rate": 1.214746621848355e-07, + "loss": 0.68935341, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 2.7499639987945557 + }, + { + "auxiliary_loss_clip": 0.01150045, + "auxiliary_loss_mlp": 0.01103911, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00043738, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 1.895729323448692, + "language_loss": 0.73831117, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76085073, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 2.619718074798584 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01102969, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00054467, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.202002982299977, + "language_loss": 0.78838456, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81056809, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 2.6296722888946533 + }, + { + "auxiliary_loss_clip": 0.01164768, + "auxiliary_loss_mlp": 0.01102411, + "balance_loss_clip": 1.00175786, + "balance_loss_mlp": 1.00036776, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.4184243390905427, + "language_loss": 0.73973352, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76240528, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.5870933532714844 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01102485, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.0005374, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.035802977675497, + "language_loss": 0.68748176, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70984089, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.5524539947509766 + }, + { + "auxiliary_loss_clip": 0.01083213, + "auxiliary_loss_mlp": 0.01103078, + "balance_loss_clip": 1.00163507, + "balance_loss_mlp": 1.00046277, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 2.012493024363334, + "language_loss": 0.6748656, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69672859, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.6988346576690674 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01103088, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00037742, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 1.9257309256321484, + "language_loss": 0.76314604, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78568101, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 2.5304858684539795 + }, + { + "auxiliary_loss_clip": 0.01114399, + "auxiliary_loss_mlp": 0.00746478, + "balance_loss_clip": 1.00102949, + "balance_loss_mlp": 1.00108171, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.7747057639631034, + "language_loss": 0.49438959, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51299834, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.1706292629241943 + }, + { + "auxiliary_loss_clip": 0.01165233, + "auxiliary_loss_mlp": 0.0110491, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00048327, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 1.9635894088415262, + "language_loss": 0.63813061, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66083205, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 3.9293782711029053 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.00747334, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00039518, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 3.2361448374443986, + "language_loss": 0.68305033, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70168716, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 2.648285388946533 + }, + { + "auxiliary_loss_clip": 0.01164914, + "auxiliary_loss_mlp": 0.01102546, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.0005033, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.133441400623218, + "language_loss": 0.80272305, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.82539761, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.534787178039551 + }, + { + "auxiliary_loss_clip": 0.01133599, + "auxiliary_loss_mlp": 0.01103695, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00041223, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 1.8136088998142486, + "language_loss": 0.689219, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.71159196, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 3.9655399322509766 + }, + { + "auxiliary_loss_clip": 0.01104057, + "auxiliary_loss_mlp": 0.01103725, + "balance_loss_clip": 1.00181365, + "balance_loss_mlp": 1.00044179, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 2.2395874812181322, + "language_loss": 0.90755177, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.92962956, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 2.6280689239501953 + }, + { + "auxiliary_loss_clip": 0.01150489, + "auxiliary_loss_mlp": 0.01102177, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00042033, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 2.1502690903688038, + "language_loss": 0.72472012, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.7472468, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 4.0414299964904785 + }, + { + "auxiliary_loss_clip": 0.01118886, + "auxiliary_loss_mlp": 0.01103742, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00045872, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.8168188977571174, + "language_loss": 0.57145917, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59368551, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 2.827388048171997 + }, + { + "auxiliary_loss_clip": 0.0111333, + "auxiliary_loss_mlp": 0.01102367, + "balance_loss_clip": 1.00174022, + "balance_loss_mlp": 1.00041902, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.714958426580323, + "language_loss": 0.76770389, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78986084, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 2.608536958694458 + }, + { + "auxiliary_loss_clip": 0.01087342, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.0015645, + "balance_loss_mlp": 1.00054944, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 1.8575266276879074, + "language_loss": 0.69141006, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71330559, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 2.73781156539917 + }, + { + "auxiliary_loss_clip": 0.01148441, + "auxiliary_loss_mlp": 0.01104074, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00050545, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.682935862045767, + "language_loss": 0.81003731, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83256251, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.5711095333099365 + }, + { + "auxiliary_loss_clip": 0.01135498, + "auxiliary_loss_mlp": 0.01103036, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00061154, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.5758479224608155, + "language_loss": 0.74809712, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77048242, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.5754642486572266 + }, + { + "auxiliary_loss_clip": 0.01132965, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00043607, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.6953714476207347, + "language_loss": 0.78458691, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80694228, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 2.6476986408233643 + }, + { + "auxiliary_loss_clip": 0.01148615, + "auxiliary_loss_mlp": 0.01102905, + "balance_loss_clip": 1.00196826, + "balance_loss_mlp": 1.00048065, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.412716609656693, + "language_loss": 0.69365919, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71617436, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 2.5596749782562256 + }, + { + "auxiliary_loss_clip": 0.01100126, + "auxiliary_loss_mlp": 0.01103953, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.0004791, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.4913120094769874, + "language_loss": 0.67687118, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69891191, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 2.765862226486206 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01103078, + "balance_loss_clip": 1.00173378, + "balance_loss_mlp": 1.00046277, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.5864797500169594, + "language_loss": 0.74468255, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76706457, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 2.6241650581359863 + }, + { + "auxiliary_loss_clip": 0.01131347, + "auxiliary_loss_mlp": 0.01101997, + "balance_loss_clip": 1.0016917, + "balance_loss_mlp": 1.00043058, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.674546025221298, + "language_loss": 0.64656198, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66889542, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.6125094890594482 + }, + { + "auxiliary_loss_clip": 0.01164859, + "auxiliary_loss_mlp": 0.01102007, + "balance_loss_clip": 1.0018183, + "balance_loss_mlp": 1.00048852, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.6586416970554836, + "language_loss": 0.66556191, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68823063, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.519650459289551 + }, + { + "auxiliary_loss_clip": 0.01103687, + "auxiliary_loss_mlp": 0.01103808, + "balance_loss_clip": 1.00188816, + "balance_loss_mlp": 1.00062013, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.4644116086226284, + "language_loss": 0.75182438, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77389932, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 2.6716065406799316 + }, + { + "auxiliary_loss_clip": 0.01148422, + "auxiliary_loss_mlp": 0.01103243, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.00043654, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.4812192679334832, + "language_loss": 0.69388151, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71639818, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.590193510055542 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01101202, + "balance_loss_clip": 1.00146008, + "balance_loss_mlp": 1.00039876, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.930585981079948, + "language_loss": 0.75841659, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.78042996, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.649110794067383 + }, + { + "auxiliary_loss_clip": 0.01135745, + "auxiliary_loss_mlp": 0.01104118, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00045323, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.620865444271692, + "language_loss": 0.57468462, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59708327, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.6574034690856934 + }, + { + "auxiliary_loss_clip": 0.01133629, + "auxiliary_loss_mlp": 0.01103167, + "balance_loss_clip": 1.00149608, + "balance_loss_mlp": 1.00045621, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.9807152532481305, + "language_loss": 0.63290554, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65527356, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 2.5434908866882324 + }, + { + "auxiliary_loss_clip": 0.01148107, + "auxiliary_loss_mlp": 0.01102486, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00044274, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 2.0856520319188854, + "language_loss": 0.78351963, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.80602551, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.5395236015319824 + }, + { + "auxiliary_loss_clip": 0.01134295, + "auxiliary_loss_mlp": 0.01102015, + "balance_loss_clip": 1.00168419, + "balance_loss_mlp": 1.00044906, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 2.0002823093462174, + "language_loss": 0.70763689, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73000002, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 2.5762417316436768 + }, + { + "auxiliary_loss_clip": 0.01150612, + "auxiliary_loss_mlp": 0.01104408, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00055265, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 2.084017765240199, + "language_loss": 0.7582761, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.78082633, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.5070080757141113 + }, + { + "auxiliary_loss_clip": 0.01116751, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_clip": 1.00145221, + "balance_loss_mlp": 1.00041449, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.4340582466434677, + "language_loss": 0.71995229, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.74214536, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 4.2485105991363525 + }, + { + "auxiliary_loss_clip": 0.01148668, + "auxiliary_loss_mlp": 0.01103876, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00040245, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 2.0376863980837143, + "language_loss": 0.83886564, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86139107, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.5851192474365234 + }, + { + "auxiliary_loss_clip": 0.01148707, + "auxiliary_loss_mlp": 0.01102068, + "balance_loss_clip": 1.00184083, + "balance_loss_mlp": 1.00050223, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 2.0248841515563387, + "language_loss": 0.80381656, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82632434, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 2.58215594291687 + }, + { + "auxiliary_loss_clip": 0.011311, + "auxiliary_loss_mlp": 0.0110267, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00053191, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 6.7132946805375155, + "language_loss": 0.77357984, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79591757, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 2.6074416637420654 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.00747332, + "balance_loss_clip": 1.0019356, + "balance_loss_mlp": 1.00036764, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.4004442961674368, + "language_loss": 0.65446943, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67342639, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 2.5439062118530273 + }, + { + "auxiliary_loss_clip": 0.01100584, + "auxiliary_loss_mlp": 0.01079728, + "balance_loss_clip": 1.00132334, + "balance_loss_mlp": 1.00000119, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7919751749962721, + "language_loss": 0.55978525, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.58158839, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 3.274587869644165 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01102669, + "balance_loss_clip": 1.00167549, + "balance_loss_mlp": 1.00062609, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 1.8840590731367273, + "language_loss": 0.77037495, + "learning_rate": 1.16316031981331e-07, + "loss": 0.79288584, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.5298891067504883 + }, + { + "auxiliary_loss_clip": 0.01148119, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_clip": 1.00183094, + "balance_loss_mlp": 1.000453, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.4350193445429507, + "language_loss": 0.67070639, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69321167, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 2.5892791748046875 + }, + { + "auxiliary_loss_clip": 0.01164896, + "auxiliary_loss_mlp": 0.01102943, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.0005188, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 3.03621736555139, + "language_loss": 0.59897327, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.62165171, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.5088775157928467 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.0110264, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.00040579, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 1.809834062301514, + "language_loss": 0.7571218, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77929699, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 2.691791296005249 + }, + { + "auxiliary_loss_clip": 0.01117101, + "auxiliary_loss_mlp": 0.01104571, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00042939, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 1.9348028321023432, + "language_loss": 0.77604449, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79826117, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 2.6171019077301025 + }, + { + "auxiliary_loss_clip": 0.01148281, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.00035632, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.7315334031138412, + "language_loss": 0.78465968, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80716836, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.5229134559631348 + }, + { + "auxiliary_loss_clip": 0.01103422, + "auxiliary_loss_mlp": 0.01102927, + "balance_loss_clip": 1.00163531, + "balance_loss_mlp": 1.00050211, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 1.910234561624402, + "language_loss": 0.7503773, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77244085, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.6356682777404785 + }, + { + "auxiliary_loss_clip": 0.01150068, + "auxiliary_loss_mlp": 0.01103368, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00027537, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.4997750357921438, + "language_loss": 0.76261216, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78514653, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 3.9385619163513184 + }, + { + "auxiliary_loss_clip": 0.01117007, + "auxiliary_loss_mlp": 0.01103414, + "balance_loss_clip": 1.00181365, + "balance_loss_mlp": 1.00051284, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.7716875782027999, + "language_loss": 0.74392319, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76612741, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 2.6078147888183594 + }, + { + "auxiliary_loss_clip": 0.01150425, + "auxiliary_loss_mlp": 0.01103288, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00048161, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.5672911193208576, + "language_loss": 0.8279224, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.85045958, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 4.229649305343628 + }, + { + "auxiliary_loss_clip": 0.01118089, + "auxiliary_loss_mlp": 0.00747402, + "balance_loss_clip": 1.00158823, + "balance_loss_mlp": 1.00035024, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.5406412342768, + "language_loss": 0.67368698, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69234192, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.7262353897094727 + }, + { + "auxiliary_loss_clip": 0.01135687, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.00053322, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 3.9651235984546616, + "language_loss": 0.74992919, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77233094, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.558377981185913 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.0110142, + "balance_loss_clip": 1.0016886, + "balance_loss_mlp": 1.00052142, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5385636797501605, + "language_loss": 0.72309268, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74541652, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 4.0465521812438965 + }, + { + "auxiliary_loss_clip": 0.01133658, + "auxiliary_loss_mlp": 0.01101657, + "balance_loss_clip": 1.00166261, + "balance_loss_mlp": 1.00047255, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 3.209440736596552, + "language_loss": 0.7577005, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.78005368, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 2.571415424346924 + }, + { + "auxiliary_loss_clip": 0.01131998, + "auxiliary_loss_mlp": 0.01104596, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00045514, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 2.0419148692730125, + "language_loss": 0.81528509, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83765107, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.5861446857452393 + }, + { + "auxiliary_loss_clip": 0.01085643, + "auxiliary_loss_mlp": 0.01102029, + "balance_loss_clip": 1.00164056, + "balance_loss_mlp": 1.00036705, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.5577291952324688, + "language_loss": 0.63493073, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65680754, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 3.0319318771362305 + }, + { + "auxiliary_loss_clip": 0.01135291, + "auxiliary_loss_mlp": 0.01103755, + "balance_loss_clip": 1.00191796, + "balance_loss_mlp": 1.00047219, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.9504815887264013, + "language_loss": 0.60915154, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63154197, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.5768954753875732 + }, + { + "auxiliary_loss_clip": 0.0116506, + "auxiliary_loss_mlp": 0.0110395, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00038147, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 1.764029726302395, + "language_loss": 0.6955964, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.71828651, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.566540002822876 + }, + { + "auxiliary_loss_clip": 0.01148479, + "auxiliary_loss_mlp": 0.00747454, + "balance_loss_clip": 1.00170922, + "balance_loss_mlp": 1.00043881, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.3779450863846074, + "language_loss": 0.71124268, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73020208, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.492978811264038 + }, + { + "auxiliary_loss_clip": 0.01055708, + "auxiliary_loss_mlp": 0.00747259, + "balance_loss_clip": 1.00132537, + "balance_loss_mlp": 1.00025976, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.7136855708370897, + "language_loss": 0.75863087, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77666056, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 2.817239761352539 + }, + { + "auxiliary_loss_clip": 0.01083557, + "auxiliary_loss_mlp": 0.01102736, + "balance_loss_clip": 1.00156832, + "balance_loss_mlp": 1.00040674, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.6804241190463935, + "language_loss": 0.76574695, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78760982, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 2.6814494132995605 + }, + { + "auxiliary_loss_clip": 0.01148603, + "auxiliary_loss_mlp": 0.01103297, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00049138, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.4342505401927839, + "language_loss": 0.81607044, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83858949, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 2.6217567920684814 + }, + { + "auxiliary_loss_clip": 0.01113831, + "auxiliary_loss_mlp": 0.01101458, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.0004636, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.761141768085193, + "language_loss": 0.74856818, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.77072102, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.650277853012085 + }, + { + "auxiliary_loss_clip": 0.01148644, + "auxiliary_loss_mlp": 0.01104311, + "balance_loss_clip": 1.00168753, + "balance_loss_mlp": 1.00045562, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.8948020752585493, + "language_loss": 0.67090994, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.69343948, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 2.531571626663208 + }, + { + "auxiliary_loss_clip": 0.01148254, + "auxiliary_loss_mlp": 0.01104175, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00031996, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.5226776847845522, + "language_loss": 0.67224342, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69476771, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.555001735687256 + }, + { + "auxiliary_loss_clip": 0.01148675, + "auxiliary_loss_mlp": 0.01102435, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00039172, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.64587088688906, + "language_loss": 0.76081133, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.78332245, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.514601945877075 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.00746389, + "balance_loss_clip": 1.00108862, + "balance_loss_mlp": 1.00094092, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7439703141982186, + "language_loss": 0.55378002, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57224619, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.258747100830078 + }, + { + "auxiliary_loss_clip": 0.01165001, + "auxiliary_loss_mlp": 0.00747496, + "balance_loss_clip": 1.0019238, + "balance_loss_mlp": 1.00043929, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.3405826850396791, + "language_loss": 0.70039552, + "learning_rate": 1.12808298352008e-07, + "loss": 0.71952051, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.542175769805908 + }, + { + "auxiliary_loss_clip": 0.01083952, + "auxiliary_loss_mlp": 0.01103673, + "balance_loss_clip": 1.00164592, + "balance_loss_mlp": 1.00058043, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.6055700937192594, + "language_loss": 0.73841691, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.76029313, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.6827902793884277 + }, + { + "auxiliary_loss_clip": 0.01099818, + "auxiliary_loss_mlp": 0.01079725, + "balance_loss_clip": 1.00110912, + "balance_loss_mlp": 0.99999756, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7907078097044916, + "language_loss": 0.61836362, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.64015907, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 3.2039730548858643 + }, + { + "auxiliary_loss_clip": 0.01148307, + "auxiliary_loss_mlp": 0.0110328, + "balance_loss_clip": 1.00179899, + "balance_loss_mlp": 1.00037837, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.6227267403691454, + "language_loss": 0.70742786, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72994369, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.5912811756134033 + }, + { + "auxiliary_loss_clip": 0.01133693, + "auxiliary_loss_mlp": 0.01103357, + "balance_loss_clip": 1.00162673, + "balance_loss_mlp": 1.00055134, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 2.0097902418643683, + "language_loss": 0.77780086, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80017132, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.613434314727783 + }, + { + "auxiliary_loss_clip": 0.0113431, + "auxiliary_loss_mlp": 0.01102968, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00044787, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.909658051125394, + "language_loss": 0.72957385, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75194669, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 3.9855430126190186 + }, + { + "auxiliary_loss_clip": 0.01148303, + "auxiliary_loss_mlp": 0.01103342, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.0004406, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 4.622580894259183, + "language_loss": 0.74823105, + "learning_rate": 1.12035883275166e-07, + "loss": 0.77074748, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 2.544139862060547 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01101224, + "balance_loss_clip": 1.00172603, + "balance_loss_mlp": 1.00042069, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 1.6434334008998777, + "language_loss": 0.76242334, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78493667, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.5524919033050537 + }, + { + "auxiliary_loss_clip": 0.01148373, + "auxiliary_loss_mlp": 0.01102569, + "balance_loss_clip": 1.00180626, + "balance_loss_mlp": 1.00052619, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.8470510712840982, + "language_loss": 0.74209511, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76460457, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.5107436180114746 + }, + { + "auxiliary_loss_clip": 0.01150278, + "auxiliary_loss_mlp": 0.01102722, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00048757, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 1.8508639034856469, + "language_loss": 0.82854044, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85107046, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 2.5283443927764893 + }, + { + "auxiliary_loss_clip": 0.01131464, + "auxiliary_loss_mlp": 0.01103826, + "balance_loss_clip": 1.00168836, + "balance_loss_mlp": 1.00054348, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 2.1178510983303, + "language_loss": 0.70355207, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72590494, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 2.586855411529541 + }, + { + "auxiliary_loss_clip": 0.01115521, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.00068212, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 1.7318678446495934, + "language_loss": 0.72179514, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74398625, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.605628252029419 + }, + { + "auxiliary_loss_clip": 0.01148144, + "auxiliary_loss_mlp": 0.0110325, + "balance_loss_clip": 1.00177824, + "balance_loss_mlp": 1.00044441, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.0513565357852768, + "language_loss": 0.63056153, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65307546, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 2.569643020629883 + }, + { + "auxiliary_loss_clip": 0.01131015, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00042915, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.5878276798894189, + "language_loss": 0.74982315, + "learning_rate": 1.111379898520437e-07, + "loss": 0.76860833, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 2.568143129348755 + }, + { + "auxiliary_loss_clip": 0.01134785, + "auxiliary_loss_mlp": 0.01103204, + "balance_loss_clip": 1.00171578, + "balance_loss_mlp": 1.00049305, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 2.1427928591546146, + "language_loss": 0.81666601, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.83904588, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.6088390350341797 + }, + { + "auxiliary_loss_clip": 0.01148452, + "auxiliary_loss_mlp": 0.01104064, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00059092, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.6965390196632986, + "language_loss": 0.61447215, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63699734, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.5358493328094482 + }, + { + "auxiliary_loss_clip": 0.01129813, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_clip": 1.00143433, + "balance_loss_mlp": 1.00005388, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.7165899821148183, + "language_loss": 0.550354, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57244611, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 3.1783010959625244 + }, + { + "auxiliary_loss_clip": 0.01116438, + "auxiliary_loss_mlp": 0.01101995, + "balance_loss_clip": 1.00157022, + "balance_loss_mlp": 1.00033319, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 8.394800808450139, + "language_loss": 0.71447146, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73665577, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 4.080078601837158 + }, + { + "auxiliary_loss_clip": 0.01131622, + "auxiliary_loss_mlp": 0.01102958, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00053287, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.666231933629649, + "language_loss": 0.78177571, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80412155, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.617422342300415 + }, + { + "auxiliary_loss_clip": 0.01148547, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00057614, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 2.5816617013775995, + "language_loss": 0.68309593, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70562387, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.603109121322632 + }, + { + "auxiliary_loss_clip": 0.01098523, + "auxiliary_loss_mlp": 0.01102863, + "balance_loss_clip": 1.00158429, + "balance_loss_mlp": 1.00053358, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.8287506330879415, + "language_loss": 0.83821595, + "learning_rate": 1.102436060943881e-07, + "loss": 0.86022985, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 4.158153772354126 + }, + { + "auxiliary_loss_clip": 0.0116502, + "auxiliary_loss_mlp": 0.00747512, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00035012, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.528254017633699, + "language_loss": 0.72562295, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74474823, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.4757816791534424 + }, + { + "auxiliary_loss_clip": 0.0115035, + "auxiliary_loss_mlp": 0.01104244, + "balance_loss_clip": 1.00198185, + "balance_loss_mlp": 1.00057971, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.4110928261122906, + "language_loss": 0.91134971, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93389565, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 3.8804807662963867 + }, + { + "auxiliary_loss_clip": 0.01086689, + "auxiliary_loss_mlp": 0.01103109, + "balance_loss_clip": 1.00163388, + "balance_loss_mlp": 1.00039804, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.917631108622834, + "language_loss": 0.73542488, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75732285, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 2.679198741912842 + }, + { + "auxiliary_loss_clip": 0.01101471, + "auxiliary_loss_mlp": 0.0110298, + "balance_loss_clip": 1.00157142, + "balance_loss_mlp": 1.00046039, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 2.3696015653615947, + "language_loss": 0.70414376, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72618824, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 2.6577816009521484 + }, + { + "auxiliary_loss_clip": 0.01131438, + "auxiliary_loss_mlp": 0.01103333, + "balance_loss_clip": 1.0017097, + "balance_loss_mlp": 1.00043178, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.0509587629870194, + "language_loss": 0.70454895, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72689664, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.557847261428833 + }, + { + "auxiliary_loss_clip": 0.01150274, + "auxiliary_loss_mlp": 0.01102794, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00055981, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.5796877316290212, + "language_loss": 0.71865857, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74118924, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 2.568155288696289 + }, + { + "auxiliary_loss_clip": 0.01135422, + "auxiliary_loss_mlp": 0.00747486, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00043392, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.5002960788979731, + "language_loss": 0.82167715, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.8405062, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 2.626093626022339 + }, + { + "auxiliary_loss_clip": 0.01098024, + "auxiliary_loss_mlp": 0.01103116, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 1.00040531, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.5753997306132843, + "language_loss": 0.7916522, + "learning_rate": 1.092257529095555e-07, + "loss": 0.8136636, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.6894991397857666 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01102742, + "balance_loss_clip": 1.00188196, + "balance_loss_mlp": 1.00041258, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5513446959235533, + "language_loss": 0.66706157, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68942475, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.7132298946380615 + }, + { + "auxiliary_loss_clip": 0.01131254, + "auxiliary_loss_mlp": 0.0110311, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00049496, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 1.6206759294969784, + "language_loss": 0.70571542, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72805905, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 2.6126484870910645 + }, + { + "auxiliary_loss_clip": 0.01132912, + "auxiliary_loss_mlp": 0.0110315, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00043941, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.5574053286255527, + "language_loss": 0.67627287, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.69863355, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 2.5988821983337402 + }, + { + "auxiliary_loss_clip": 0.01133681, + "auxiliary_loss_mlp": 0.01103091, + "balance_loss_clip": 1.00173485, + "balance_loss_mlp": 1.00047565, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 4.088877258214939, + "language_loss": 0.75191957, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77428722, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 2.562934398651123 + }, + { + "auxiliary_loss_clip": 0.01149496, + "auxiliary_loss_mlp": 0.01102326, + "balance_loss_clip": 1.00195158, + "balance_loss_mlp": 1.00056851, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.6313199876842015, + "language_loss": 0.63175136, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65426958, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.5492215156555176 + }, + { + "auxiliary_loss_clip": 0.01148088, + "auxiliary_loss_mlp": 0.01102735, + "balance_loss_clip": 1.00172544, + "balance_loss_mlp": 1.00040555, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.6183363810529776, + "language_loss": 0.71845114, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.74095941, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.01101686, + "auxiliary_loss_mlp": 0.01103205, + "balance_loss_clip": 1.00163555, + "balance_loss_mlp": 1.00049412, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.6306079480480045, + "language_loss": 0.74399579, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76604474, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 2.724281072616577 + }, + { + "auxiliary_loss_clip": 0.01118921, + "auxiliary_loss_mlp": 0.01102233, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00038123, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.5970376539335276, + "language_loss": 0.60328758, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62549913, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.641141891479492 + }, + { + "auxiliary_loss_clip": 0.01114487, + "auxiliary_loss_mlp": 0.01102615, + "balance_loss_clip": 1.00143957, + "balance_loss_mlp": 1.00038075, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 1.766576593383994, + "language_loss": 0.76636577, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78853679, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.6349432468414307 + }, + { + "auxiliary_loss_clip": 0.01133938, + "auxiliary_loss_mlp": 0.01103001, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00048113, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.6020151169040917, + "language_loss": 0.74004549, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76241487, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.590522050857544 + }, + { + "auxiliary_loss_clip": 0.01114674, + "auxiliary_loss_mlp": 0.01079337, + "balance_loss_clip": 1.00117803, + "balance_loss_mlp": 0.99999142, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8406367527611732, + "language_loss": 0.63499838, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65693849, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.078169584274292 + }, + { + "auxiliary_loss_clip": 0.01133468, + "auxiliary_loss_mlp": 0.01102201, + "balance_loss_clip": 1.0018034, + "balance_loss_mlp": 1.00044394, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 1.9920357160561983, + "language_loss": 0.80401695, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.8263737, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.5525684356689453 + }, + { + "auxiliary_loss_clip": 0.01110744, + "auxiliary_loss_mlp": 0.01079769, + "balance_loss_clip": 1.00124454, + "balance_loss_mlp": 1.00004196, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7186740919473702, + "language_loss": 0.52893662, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.55084181, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 3.3046693801879883 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01103493, + "balance_loss_clip": 1.00183368, + "balance_loss_mlp": 1.00040114, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.6796203520625514, + "language_loss": 0.77837509, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80105931, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 2.5029637813568115 + }, + { + "auxiliary_loss_clip": 0.01149809, + "auxiliary_loss_mlp": 0.01103871, + "balance_loss_clip": 1.00181448, + "balance_loss_mlp": 1.00049281, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 1.853868696497119, + "language_loss": 0.73146904, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75400585, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 3.9958558082580566 + }, + { + "auxiliary_loss_clip": 0.01135185, + "auxiliary_loss_mlp": 0.01103642, + "balance_loss_clip": 1.00167525, + "balance_loss_mlp": 1.0004549, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.24630175022967, + "language_loss": 0.79878139, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.82116961, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 2.570834159851074 + }, + { + "auxiliary_loss_clip": 0.01133854, + "auxiliary_loss_mlp": 0.01103525, + "balance_loss_clip": 1.00161827, + "balance_loss_mlp": 1.00043273, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.4693735867108817, + "language_loss": 0.7137115, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.7360853, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.5880722999572754 + }, + { + "auxiliary_loss_clip": 0.01121238, + "auxiliary_loss_mlp": 0.01104368, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.0005126, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 1.8154887735544525, + "language_loss": 0.7613132, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.78356922, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.6225123405456543 + }, + { + "auxiliary_loss_clip": 0.01165217, + "auxiliary_loss_mlp": 0.01104572, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00033545, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 2.154372130079618, + "language_loss": 0.73748708, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.760185, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 2.523087978363037 + }, + { + "auxiliary_loss_clip": 0.01102371, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00039315, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 1.7646246106550532, + "language_loss": 0.64327967, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66534013, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 2.6678285598754883 + }, + { + "auxiliary_loss_clip": 0.01131518, + "auxiliary_loss_mlp": 0.01102629, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00049031, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 2.070244810758526, + "language_loss": 0.69901645, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72135794, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 2.5973103046417236 + }, + { + "auxiliary_loss_clip": 0.01116621, + "auxiliary_loss_mlp": 0.01102061, + "balance_loss_clip": 1.00160384, + "balance_loss_mlp": 1.00039959, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.7632550309223967, + "language_loss": 0.74550605, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76769292, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 2.815476894378662 + }, + { + "auxiliary_loss_clip": 0.01100071, + "auxiliary_loss_mlp": 0.01103518, + "balance_loss_clip": 1.00158381, + "balance_loss_mlp": 1.00042534, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.8135029148534854, + "language_loss": 0.75867695, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.78071284, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 2.699554681777954 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01103013, + "balance_loss_clip": 1.00198627, + "balance_loss_mlp": 1.00049329, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.575625680543952, + "language_loss": 0.66858041, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.69096476, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 2.560377836227417 + }, + { + "auxiliary_loss_clip": 0.01165138, + "auxiliary_loss_mlp": 0.01103965, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00039577, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 2.1794020836726293, + "language_loss": 0.74174881, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76443982, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 2.555070638656616 + }, + { + "auxiliary_loss_clip": 0.01164967, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00052035, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.0822733007696765, + "language_loss": 0.57113367, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.59381568, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 3.852097749710083 + }, + { + "auxiliary_loss_clip": 0.01133299, + "auxiliary_loss_mlp": 0.0110341, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00041294, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 1.934358171305938, + "language_loss": 0.8177833, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.84015036, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.584946870803833 + }, + { + "auxiliary_loss_clip": 0.0116487, + "auxiliary_loss_mlp": 0.01102379, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00052643, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 1.71559159233231, + "language_loss": 0.59914786, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62182033, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 2.5491209030151367 + }, + { + "auxiliary_loss_clip": 0.01149349, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00041795, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.12122396552731, + "language_loss": 0.55226564, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57478762, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.5447378158569336 + }, + { + "auxiliary_loss_clip": 0.01101742, + "auxiliary_loss_mlp": 0.01103838, + "balance_loss_clip": 1.00163138, + "balance_loss_mlp": 1.00046027, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.667778282744728, + "language_loss": 0.79641891, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.81847471, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 4.306823492050171 + }, + { + "auxiliary_loss_clip": 0.01165099, + "auxiliary_loss_mlp": 0.01103335, + "balance_loss_clip": 1.00186765, + "balance_loss_mlp": 1.00052881, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.6285375861553042, + "language_loss": 0.78733194, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.81001627, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.5198721885681152 + }, + { + "auxiliary_loss_clip": 0.01068045, + "auxiliary_loss_mlp": 0.01103137, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.00052202, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.3942941322595857, + "language_loss": 0.74774057, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76945245, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 4.1652991771698 + }, + { + "auxiliary_loss_clip": 0.01164874, + "auxiliary_loss_mlp": 0.01102306, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00045347, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.8068450813702956, + "language_loss": 0.68410301, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70677483, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 2.474573850631714 + }, + { + "auxiliary_loss_clip": 0.01131094, + "auxiliary_loss_mlp": 0.01102646, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00041258, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.6347445115397392, + "language_loss": 0.6591503, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68148768, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 2.615041971206665 + }, + { + "auxiliary_loss_clip": 0.01133336, + "auxiliary_loss_mlp": 0.01101222, + "balance_loss_clip": 1.00162971, + "balance_loss_mlp": 1.00041914, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.4797712167853954, + "language_loss": 0.83206093, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85440648, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 2.5786936283111572 + }, + { + "auxiliary_loss_clip": 0.01131707, + "auxiliary_loss_mlp": 0.01103904, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00043011, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 2.113630620300756, + "language_loss": 0.76168251, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78403854, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.618305206298828 + }, + { + "auxiliary_loss_clip": 0.01083497, + "auxiliary_loss_mlp": 0.0107936, + "balance_loss_clip": 1.00119007, + "balance_loss_mlp": 1.00001478, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.778308851791991, + "language_loss": 0.57585776, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59748638, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.118572950363159 + }, + { + "auxiliary_loss_clip": 0.01148716, + "auxiliary_loss_mlp": 0.01105067, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00044918, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.219138528950463, + "language_loss": 0.67294097, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.6954788, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 2.535745620727539 + }, + { + "auxiliary_loss_clip": 0.01164934, + "auxiliary_loss_mlp": 0.0110291, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.00039029, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 1.8657235464609583, + "language_loss": 0.71584606, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.7385245, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.4965810775756836 + }, + { + "auxiliary_loss_clip": 0.01118364, + "auxiliary_loss_mlp": 0.01103931, + "balance_loss_clip": 1.0017364, + "balance_loss_mlp": 1.00045729, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 1.745128115416712, + "language_loss": 0.73758775, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75981069, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.672957181930542 + }, + { + "auxiliary_loss_clip": 0.01102337, + "auxiliary_loss_mlp": 0.00747307, + "balance_loss_clip": 1.00145304, + "balance_loss_mlp": 1.00033104, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 5.600850936781897, + "language_loss": 0.7219286, + "learning_rate": 1.040813291960323e-07, + "loss": 0.74042499, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.6337544918060303 + }, + { + "auxiliary_loss_clip": 0.01148617, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_clip": 1.00173545, + "balance_loss_mlp": 1.00046849, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 1.8584714325202742, + "language_loss": 0.70747399, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.72999674, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 2.529858112335205 + }, + { + "auxiliary_loss_clip": 0.01165105, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_clip": 1.00199425, + "balance_loss_mlp": 1.00038838, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 2.799453228268832, + "language_loss": 0.75831854, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78099966, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 2.5176970958709717 + }, + { + "auxiliary_loss_clip": 0.0115031, + "auxiliary_loss_mlp": 0.01102819, + "balance_loss_clip": 1.00176632, + "balance_loss_mlp": 1.00039387, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.6242364902129993, + "language_loss": 0.73121876, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75375003, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.54496169090271 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01102341, + "balance_loss_clip": 1.00150061, + "balance_loss_mlp": 1.00029826, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 1.997698478420539, + "language_loss": 0.81508029, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83725369, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 2.590700626373291 + }, + { + "auxiliary_loss_clip": 0.01131259, + "auxiliary_loss_mlp": 0.01104324, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00046849, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 1.8128737232540098, + "language_loss": 0.81765962, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.84001547, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 2.5266597270965576 + }, + { + "auxiliary_loss_clip": 0.0116494, + "auxiliary_loss_mlp": 0.01103093, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00057268, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.654892105656378, + "language_loss": 0.57940078, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60208106, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.5289976596832275 + }, + { + "auxiliary_loss_clip": 0.01165206, + "auxiliary_loss_mlp": 0.01103967, + "balance_loss_clip": 1.00208342, + "balance_loss_mlp": 1.00058889, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.85637466794101, + "language_loss": 0.63400292, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65669471, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.5246105194091797 + }, + { + "auxiliary_loss_clip": 0.01148558, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_clip": 1.00187874, + "balance_loss_mlp": 1.00043392, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.7708900007596917, + "language_loss": 0.72975612, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75227892, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01103551, + "balance_loss_clip": 1.00199854, + "balance_loss_mlp": 1.0004586, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.670718795792291, + "language_loss": 0.69490337, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71739531, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.575068950653076 + }, + { + "auxiliary_loss_clip": 0.01133958, + "auxiliary_loss_mlp": 0.00747534, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00039744, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.261808583300411, + "language_loss": 0.64939266, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.66820753, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.542194366455078 + }, + { + "auxiliary_loss_clip": 0.01119054, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00051951, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.808776179580133, + "language_loss": 0.7871716, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.80939823, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 4.141986846923828 + }, + { + "auxiliary_loss_clip": 0.01129212, + "auxiliary_loss_mlp": 0.01079681, + "balance_loss_clip": 1.00119019, + "balance_loss_mlp": 0.99995393, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7284026403567766, + "language_loss": 0.53564632, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.5577352, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 3.2129499912261963 + }, + { + "auxiliary_loss_clip": 0.01148501, + "auxiliary_loss_mlp": 0.0110373, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00044656, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.7148815960364652, + "language_loss": 0.82334411, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84586644, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.5990400314331055 + }, + { + "auxiliary_loss_clip": 0.01100623, + "auxiliary_loss_mlp": 0.01102243, + "balance_loss_clip": 1.00159109, + "balance_loss_mlp": 1.00058091, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 5.817165083951747, + "language_loss": 0.81436503, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83639371, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.66878342628479 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01103705, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00061274, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 3.8487336601896995, + "language_loss": 0.71868646, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.74107683, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 2.6062214374542236 + }, + { + "auxiliary_loss_clip": 0.01148342, + "auxiliary_loss_mlp": 0.01101945, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.00037849, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.3049323910533088, + "language_loss": 0.75226808, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77477086, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.5815610885620117 + }, + { + "auxiliary_loss_clip": 0.01164857, + "auxiliary_loss_mlp": 0.01102371, + "balance_loss_clip": 1.00182498, + "balance_loss_mlp": 1.0005188, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.517140740298459, + "language_loss": 0.7009933, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72366548, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 2.509775161743164 + }, + { + "auxiliary_loss_clip": 0.01131351, + "auxiliary_loss_mlp": 0.01102972, + "balance_loss_clip": 1.00157428, + "balance_loss_mlp": 1.0004518, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 1.9016649689867553, + "language_loss": 0.70391357, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.72625685, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 2.599390745162964 + }, + { + "auxiliary_loss_clip": 0.01150435, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_clip": 1.00175762, + "balance_loss_mlp": 1.00049543, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.5706054579560416, + "language_loss": 0.76820934, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.79074007, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 2.515279769897461 + }, + { + "auxiliary_loss_clip": 0.01148597, + "auxiliary_loss_mlp": 0.01102307, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00055027, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 1.6835167361925794, + "language_loss": 0.73661554, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75912452, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.539740800857544 + }, + { + "auxiliary_loss_clip": 0.01132166, + "auxiliary_loss_mlp": 0.01103702, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.0003233, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 2.801625860013864, + "language_loss": 0.69540429, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71776295, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 2.5990869998931885 + }, + { + "auxiliary_loss_clip": 0.01164999, + "auxiliary_loss_mlp": 0.01102841, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00051212, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 2.019608531274786, + "language_loss": 0.79879451, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.82147288, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.5029361248016357 + }, + { + "auxiliary_loss_clip": 0.01102981, + "auxiliary_loss_mlp": 0.01103329, + "balance_loss_clip": 1.00163484, + "balance_loss_mlp": 1.00033176, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 1.7418820554523662, + "language_loss": 0.77759212, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.7996552, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 4.089727878570557 + }, + { + "auxiliary_loss_clip": 0.01111292, + "auxiliary_loss_mlp": 0.00746578, + "balance_loss_clip": 1.00075221, + "balance_loss_mlp": 1.00128114, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.803114300555595, + "language_loss": 0.60239697, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62097567, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.1025543212890625 + }, + { + "auxiliary_loss_clip": 0.01148062, + "auxiliary_loss_mlp": 0.01102252, + "balance_loss_clip": 1.00179064, + "balance_loss_mlp": 1.00049508, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 2.658234608651193, + "language_loss": 0.83126533, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.85376847, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.5483200550079346 + }, + { + "auxiliary_loss_clip": 0.01164993, + "auxiliary_loss_mlp": 0.01103335, + "balance_loss_clip": 1.00188959, + "balance_loss_mlp": 1.00052857, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 1.99564212248645, + "language_loss": 0.73074806, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75343138, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 3.8846397399902344 + }, + { + "auxiliary_loss_clip": 0.01147626, + "auxiliary_loss_mlp": 0.01102316, + "balance_loss_clip": 1.00172472, + "balance_loss_mlp": 1.00036871, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.8053832632373117, + "language_loss": 0.64448941, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66698885, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.613737106323242 + }, + { + "auxiliary_loss_clip": 0.01120726, + "auxiliary_loss_mlp": 0.01103047, + "balance_loss_clip": 1.00181067, + "balance_loss_mlp": 1.00062275, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 1.676499423525799, + "language_loss": 0.66480643, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68704414, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 2.702141046524048 + }, + { + "auxiliary_loss_clip": 0.01150237, + "auxiliary_loss_mlp": 0.01101873, + "balance_loss_clip": 1.00177014, + "balance_loss_mlp": 1.00040221, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.707137055134909, + "language_loss": 0.65952247, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68204355, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 3.9513041973114014 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01103006, + "balance_loss_clip": 1.00179005, + "balance_loss_mlp": 1.00058186, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.8064937070055735, + "language_loss": 0.77533948, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79772234, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 2.559688091278076 + }, + { + "auxiliary_loss_clip": 0.01164958, + "auxiliary_loss_mlp": 0.01102724, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00039506, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.6518780479383504, + "language_loss": 0.75298893, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77566576, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 2.5040340423583984 + }, + { + "auxiliary_loss_clip": 0.01082569, + "auxiliary_loss_mlp": 0.01103297, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00049126, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.1303738826503653, + "language_loss": 0.75940239, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.78126109, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 2.6919140815734863 + }, + { + "auxiliary_loss_clip": 0.01148512, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00042939, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.0121901769636583, + "language_loss": 0.80847788, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.8309896, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.813915252685547 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.01101936, + "balance_loss_clip": 1.00176418, + "balance_loss_mlp": 1.00046539, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.43602515102262, + "language_loss": 0.78622055, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80840182, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.6261768341064453 + }, + { + "auxiliary_loss_clip": 0.01133331, + "auxiliary_loss_mlp": 0.01103675, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00058246, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 4.711289575811425, + "language_loss": 0.68402982, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70639986, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.600950002670288 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01103299, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00049305, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 2.7882520162132716, + "language_loss": 0.86027628, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88264471, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.5570015907287598 + }, + { + "auxiliary_loss_clip": 0.01118776, + "auxiliary_loss_mlp": 0.01103831, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00054836, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 4.300249471780651, + "language_loss": 0.7240001, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74622619, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 2.6252243518829346 + }, + { + "auxiliary_loss_clip": 0.01149868, + "auxiliary_loss_mlp": 0.01103694, + "balance_loss_clip": 1.00172484, + "balance_loss_mlp": 1.00050676, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.6953856429882974, + "language_loss": 0.71285266, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73538828, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.5385959148406982 + }, + { + "auxiliary_loss_clip": 0.01133808, + "auxiliary_loss_mlp": 0.0110335, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00035322, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.7754230558575095, + "language_loss": 0.84337473, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86574632, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.537447452545166 + }, + { + "auxiliary_loss_clip": 0.01135295, + "auxiliary_loss_mlp": 0.01102688, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00035918, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.6878538103987546, + "language_loss": 0.78431857, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80669844, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 2.622239828109741 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01102451, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00050342, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.7334733573554502, + "language_loss": 0.73417258, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75636756, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.622159242630005 + }, + { + "auxiliary_loss_clip": 0.01164974, + "auxiliary_loss_mlp": 0.01103384, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.00048208, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.5791266133038213, + "language_loss": 0.73280025, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75548387, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 2.468597412109375 + }, + { + "auxiliary_loss_clip": 0.01131882, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_clip": 1.00183237, + "balance_loss_mlp": 1.00044608, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.8236664262669655, + "language_loss": 0.74207515, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76443028, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 2.5626327991485596 + }, + { + "auxiliary_loss_clip": 0.01150419, + "auxiliary_loss_mlp": 0.01103925, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.0003562, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 2.250764174059354, + "language_loss": 0.73151302, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75405645, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 2.522430658340454 + }, + { + "auxiliary_loss_clip": 0.01116381, + "auxiliary_loss_mlp": 0.01102253, + "balance_loss_clip": 1.00171053, + "balance_loss_mlp": 1.00059128, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.5682576978099758, + "language_loss": 0.69098103, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71316731, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 2.620877981185913 + }, + { + "auxiliary_loss_clip": 0.01148096, + "auxiliary_loss_mlp": 0.01101825, + "balance_loss_clip": 1.0017668, + "balance_loss_mlp": 1.00044942, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.396794693076408, + "language_loss": 0.72977221, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75227141, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.5969760417938232 + }, + { + "auxiliary_loss_clip": 0.01164831, + "auxiliary_loss_mlp": 0.01103075, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00045979, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.7457848715649715, + "language_loss": 0.72108781, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74376678, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.482820987701416 + }, + { + "auxiliary_loss_clip": 0.01133728, + "auxiliary_loss_mlp": 0.01102992, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00037682, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 2.340773094145272, + "language_loss": 0.69235718, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71472442, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.5817105770111084 + }, + { + "auxiliary_loss_clip": 0.01164969, + "auxiliary_loss_mlp": 0.01101392, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00049317, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 1.8841383360127715, + "language_loss": 0.70553958, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72820324, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 3.9372243881225586 + }, + { + "auxiliary_loss_clip": 0.01131217, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.00192952, + "balance_loss_mlp": 1.00054193, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.6335005214377851, + "language_loss": 0.69708353, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71943009, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 2.5658226013183594 + }, + { + "auxiliary_loss_clip": 0.01147851, + "auxiliary_loss_mlp": 0.01102913, + "balance_loss_clip": 1.00180888, + "balance_loss_mlp": 1.00039256, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 2.070051565621204, + "language_loss": 0.69076848, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71327615, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 2.574493408203125 + }, + { + "auxiliary_loss_clip": 0.01164844, + "auxiliary_loss_mlp": 0.01101682, + "balance_loss_clip": 1.00179374, + "balance_loss_mlp": 1.00049686, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.7056654303218126, + "language_loss": 0.72002482, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74269009, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 2.460082769393921 + }, + { + "auxiliary_loss_clip": 0.01150259, + "auxiliary_loss_mlp": 0.01103034, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00051367, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.4749606173274394, + "language_loss": 0.74361885, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76615179, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.555403232574463 + }, + { + "auxiliary_loss_clip": 0.01165018, + "auxiliary_loss_mlp": 0.01103886, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00041223, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 2.0772048668297294, + "language_loss": 0.72800517, + "learning_rate": 9.749020425753251e-08, + "loss": 0.75069422, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.502140760421753 + }, + { + "auxiliary_loss_clip": 0.01117675, + "auxiliary_loss_mlp": 0.01101458, + "balance_loss_clip": 1.00175238, + "balance_loss_mlp": 1.000368, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.6073957823526723, + "language_loss": 0.71967083, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74186218, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.6563291549682617 + }, + { + "auxiliary_loss_clip": 0.01148334, + "auxiliary_loss_mlp": 0.01103238, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00043225, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.6593680244804179, + "language_loss": 0.82553327, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84804904, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 2.5853400230407715 + }, + { + "auxiliary_loss_clip": 0.01148208, + "auxiliary_loss_mlp": 0.01102694, + "balance_loss_clip": 1.00175917, + "balance_loss_mlp": 1.00036442, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.419168556513117, + "language_loss": 0.69847143, + "learning_rate": 9.713019223328966e-08, + "loss": 0.72098047, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.5631556510925293 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01101675, + "balance_loss_clip": 1.00162268, + "balance_loss_mlp": 1.00058556, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.5299444320838493, + "language_loss": 0.76487565, + "learning_rate": 9.70103325331717e-08, + "loss": 0.78704119, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 2.663743257522583 + }, + { + "auxiliary_loss_clip": 0.01148308, + "auxiliary_loss_mlp": 0.0110228, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00042748, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.7684071839733446, + "language_loss": 0.68462598, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70713186, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 2.533980369567871 + }, + { + "auxiliary_loss_clip": 0.01103103, + "auxiliary_loss_mlp": 0.0110156, + "balance_loss_clip": 1.00169075, + "balance_loss_mlp": 1.0004704, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.6324908123841197, + "language_loss": 0.7596361, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78168273, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.676978349685669 + }, + { + "auxiliary_loss_clip": 0.01097733, + "auxiliary_loss_mlp": 0.01102686, + "balance_loss_clip": 1.00169873, + "balance_loss_mlp": 1.00054801, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.7695002742383215, + "language_loss": 0.69224691, + "learning_rate": 9.665118642033765e-08, + "loss": 0.7142511, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 4.133347034454346 + }, + { + "auxiliary_loss_clip": 0.01148663, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_clip": 1.00188267, + "balance_loss_mlp": 1.0005157, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 2.2099041318440786, + "language_loss": 0.73760802, + "learning_rate": 9.653161539369858e-08, + "loss": 0.76013833, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.526735544204712 + }, + { + "auxiliary_loss_clip": 0.01149443, + "auxiliary_loss_mlp": 0.0110254, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00049686, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 2.332814451818612, + "language_loss": 0.68096495, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70348471, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 2.6925508975982666 + }, + { + "auxiliary_loss_clip": 0.01131816, + "auxiliary_loss_mlp": 0.01101912, + "balance_loss_clip": 1.00168383, + "balance_loss_mlp": 1.00034523, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.4985149093562538, + "language_loss": 0.76433963, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78667688, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 4.0118255615234375 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01103449, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00045228, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 1.7176314474070031, + "language_loss": 0.75405645, + "learning_rate": 9.617333541017502e-08, + "loss": 0.7767415, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.4689395427703857 + }, + { + "auxiliary_loss_clip": 0.01120336, + "auxiliary_loss_mlp": 0.01102345, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00058746, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.5737405616249334, + "language_loss": 0.73881727, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76104403, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 2.6637210845947266 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01102972, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00045252, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.6265959938724166, + "language_loss": 0.63431406, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65653121, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 4.07574725151062 + }, + { + "auxiliary_loss_clip": 0.01165063, + "auxiliary_loss_mlp": 0.01103285, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.00057459, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 1.932225805759855, + "language_loss": 0.61572266, + "learning_rate": 9.581570516631643e-08, + "loss": 0.63840616, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.531867265701294 + }, + { + "auxiliary_loss_clip": 0.0109574, + "auxiliary_loss_mlp": 0.01101403, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00040936, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.6139778755157685, + "language_loss": 0.82381833, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84578973, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.656196117401123 + }, + { + "auxiliary_loss_clip": 0.01164993, + "auxiliary_loss_mlp": 0.01103654, + "balance_loss_clip": 1.00184333, + "balance_loss_mlp": 1.00046682, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.6619025200586135, + "language_loss": 0.67758954, + "learning_rate": 9.557764603050667e-08, + "loss": 0.70027602, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 2.4900455474853516 + }, + { + "auxiliary_loss_clip": 0.01132886, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_clip": 1.0016911, + "balance_loss_mlp": 1.0005461, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.758859613765506, + "language_loss": 0.75143611, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77379274, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 2.547314167022705 + }, + { + "auxiliary_loss_clip": 0.01131469, + "auxiliary_loss_mlp": 0.01103018, + "balance_loss_clip": 1.00185788, + "balance_loss_mlp": 1.00049782, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.511400157220488, + "language_loss": 0.70193714, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72428203, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.581523895263672 + }, + { + "auxiliary_loss_clip": 0.01118008, + "auxiliary_loss_mlp": 0.01102633, + "balance_loss_clip": 1.00174379, + "balance_loss_mlp": 1.00030422, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.6386665595249312, + "language_loss": 0.67800456, + "learning_rate": 9.522109895720709e-08, + "loss": 0.70021105, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.640622615814209 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01102334, + "balance_loss_clip": 1.0017451, + "balance_loss_mlp": 1.00048137, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 2.032215303821393, + "language_loss": 0.57565171, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59815669, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.6530725955963135 + }, + { + "auxiliary_loss_clip": 0.01128743, + "auxiliary_loss_mlp": 0.00746321, + "balance_loss_clip": 1.00103092, + "balance_loss_mlp": 1.00086558, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7759218639443886, + "language_loss": 0.56928498, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58803564, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.1229777336120605 + }, + { + "auxiliary_loss_clip": 0.01131426, + "auxiliary_loss_mlp": 0.01103045, + "balance_loss_clip": 1.00169539, + "balance_loss_mlp": 1.00042951, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 1.7166219019043834, + "language_loss": 0.69835871, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72070342, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 2.5683341026306152 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01103181, + "balance_loss_clip": 1.00167179, + "balance_loss_mlp": 1.00056565, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.4965948666806246, + "language_loss": 0.69975483, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72211313, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.5449624061584473 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.01103912, + "balance_loss_clip": 1.00172079, + "balance_loss_mlp": 1.00053358, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 1.768362487644051, + "language_loss": 0.656093, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67829806, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.6230878829956055 + }, + { + "auxiliary_loss_clip": 0.0111483, + "auxiliary_loss_mlp": 0.01103343, + "balance_loss_clip": 1.00164843, + "balance_loss_mlp": 1.00053644, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.0343294111126404, + "language_loss": 0.62094605, + "learning_rate": 9.450995512600379e-08, + "loss": 0.6431278, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 2.5610926151275635 + }, + { + "auxiliary_loss_clip": 0.01165112, + "auxiliary_loss_mlp": 0.00747408, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.0004313, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.4084905530872551, + "language_loss": 0.71232986, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73145503, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 2.563432455062866 + }, + { + "auxiliary_loss_clip": 0.01148116, + "auxiliary_loss_mlp": 0.01102237, + "balance_loss_clip": 1.00164199, + "balance_loss_mlp": 1.00043225, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 3.0109159797178484, + "language_loss": 0.7577647, + "learning_rate": 9.427348518535483e-08, + "loss": 0.78026819, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 2.497880220413208 + }, + { + "auxiliary_loss_clip": 0.01148174, + "auxiliary_loss_mlp": 0.01101865, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 1.00048995, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.606950199575654, + "language_loss": 0.75539148, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77789187, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 2.5538394451141357 + }, + { + "auxiliary_loss_clip": 0.01165083, + "auxiliary_loss_mlp": 0.00747374, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00035512, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.5479202967271763, + "language_loss": 0.82277983, + "learning_rate": 9.403730430606472e-08, + "loss": 0.8419044, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 2.537752866744995 + }, + { + "auxiliary_loss_clip": 0.01148293, + "auxiliary_loss_mlp": 0.01103036, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00051582, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.3979915244849583, + "language_loss": 0.89349675, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91601002, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.5564637184143066 + }, + { + "auxiliary_loss_clip": 0.01150529, + "auxiliary_loss_mlp": 0.01103518, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.0004257, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 2.2528891566302898, + "language_loss": 0.76925564, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79179609, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.512667655944824 + }, + { + "auxiliary_loss_clip": 0.01148334, + "auxiliary_loss_mlp": 0.01103333, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00052667, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.473334958250902, + "language_loss": 0.73121798, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75373471, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 2.5967514514923096 + }, + { + "auxiliary_loss_clip": 0.01102168, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00037479, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.6629244643361696, + "language_loss": 0.8309232, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85297102, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 4.086555480957031 + }, + { + "auxiliary_loss_clip": 0.01147688, + "auxiliary_loss_mlp": 0.0110248, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00053287, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.5261113215984479, + "language_loss": 0.85257202, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87507367, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 2.543242931365967 + }, + { + "auxiliary_loss_clip": 0.01130666, + "auxiliary_loss_mlp": 0.01103096, + "balance_loss_clip": 1.00189817, + "balance_loss_mlp": 1.00038552, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.7913113945149342, + "language_loss": 0.71549565, + "learning_rate": 9.333049639436863e-08, + "loss": 0.73783332, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.6251113414764404 + }, + { + "auxiliary_loss_clip": 0.01150411, + "auxiliary_loss_mlp": 0.01102519, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00047612, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 2.0744086674790405, + "language_loss": 0.81004238, + "learning_rate": 9.321294810356418e-08, + "loss": 0.83257163, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 2.5469117164611816 + }, + { + "auxiliary_loss_clip": 0.0114326, + "auxiliary_loss_mlp": 0.01079417, + "balance_loss_clip": 1.00103259, + "balance_loss_mlp": 1.00007105, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6699633977517359, + "language_loss": 0.51350087, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53572762, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.2197155952453613 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01103302, + "balance_loss_clip": 1.00168324, + "balance_loss_mlp": 1.0004003, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.6908397020814887, + "language_loss": 0.67016804, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69221413, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.63405704498291 + }, + { + "auxiliary_loss_clip": 0.01131564, + "auxiliary_loss_mlp": 0.01102926, + "balance_loss_clip": 1.00168705, + "balance_loss_mlp": 1.00040638, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.2913203942602896, + "language_loss": 0.63950408, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66184902, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.544720411300659 + }, + { + "auxiliary_loss_clip": 0.01133531, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00054359, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.7932341914247008, + "language_loss": 0.71958488, + "learning_rate": 9.274347804044058e-08, + "loss": 0.74195468, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.5589606761932373 + }, + { + "auxiliary_loss_clip": 0.01164817, + "auxiliary_loss_mlp": 0.01101583, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.0003978, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.5327368988087717, + "language_loss": 0.70499271, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72765672, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 2.482475519180298 + }, + { + "auxiliary_loss_clip": 0.01114647, + "auxiliary_loss_mlp": 0.01101984, + "balance_loss_clip": 1.00188506, + "balance_loss_mlp": 1.00041747, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.6172206191181882, + "language_loss": 0.72212833, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74429464, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 2.658447504043579 + }, + { + "auxiliary_loss_clip": 0.01149394, + "auxiliary_loss_mlp": 0.01103091, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00057101, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 1.8652724301206516, + "language_loss": 0.69575626, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71828115, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 2.589069366455078 + }, + { + "auxiliary_loss_clip": 0.01131242, + "auxiliary_loss_mlp": 0.01103125, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00060463, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.6075797352852725, + "language_loss": 0.62753755, + "learning_rate": 9.227516515099743e-08, + "loss": 0.64988124, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 2.569375991821289 + }, + { + "auxiliary_loss_clip": 0.01065965, + "auxiliary_loss_mlp": 0.01103724, + "balance_loss_clip": 1.00159395, + "balance_loss_mlp": 1.00044096, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 2.0605287599195674, + "language_loss": 0.80134428, + "learning_rate": 9.215826777033675e-08, + "loss": 0.8230412, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 4.138226270675659 + }, + { + "auxiliary_loss_clip": 0.01131555, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_clip": 1.00175011, + "balance_loss_mlp": 1.00055599, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.5282796705389565, + "language_loss": 0.70321167, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72556752, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 2.5490145683288574 + }, + { + "auxiliary_loss_clip": 0.01164832, + "auxiliary_loss_mlp": 0.01101733, + "balance_loss_clip": 1.00180924, + "balance_loss_mlp": 1.00050092, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 2.239416495730513, + "language_loss": 0.85430133, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87696695, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.470341682434082 + }, + { + "auxiliary_loss_clip": 0.01150364, + "auxiliary_loss_mlp": 0.01103686, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.00040317, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 2.589720557722134, + "language_loss": 0.59200102, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61454153, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 4.207663536071777 + }, + { + "auxiliary_loss_clip": 0.01115158, + "auxiliary_loss_mlp": 0.01103665, + "balance_loss_clip": 1.00143576, + "balance_loss_mlp": 1.00028682, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 1.9295951764212964, + "language_loss": 0.81444764, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83663583, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 2.5809452533721924 + }, + { + "auxiliary_loss_clip": 0.01165161, + "auxiliary_loss_mlp": 0.01104051, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00048161, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.9314209137128426, + "language_loss": 0.62006783, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64275998, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 3.9483907222747803 + }, + { + "auxiliary_loss_clip": 0.01133565, + "auxiliary_loss_mlp": 0.01101634, + "balance_loss_clip": 1.00174391, + "balance_loss_mlp": 1.00044918, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.783672706491963, + "language_loss": 0.72844577, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75079775, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 2.7429885864257812 + }, + { + "auxiliary_loss_clip": 0.01147829, + "auxiliary_loss_mlp": 0.01102237, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00038493, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.8499774444841517, + "language_loss": 0.8078109, + "learning_rate": 9.134201202899161e-08, + "loss": 0.83031154, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.5001585483551025 + }, + { + "auxiliary_loss_clip": 0.01081665, + "auxiliary_loss_mlp": 0.00746186, + "balance_loss_clip": 1.00098455, + "balance_loss_mlp": 1.00093925, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.745630135496712, + "language_loss": 0.52348095, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54175943, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.347669839859009 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01079364, + "balance_loss_clip": 1.00133204, + "balance_loss_mlp": 1.00001812, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7277162436613284, + "language_loss": 0.62098515, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64290643, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.0845165252685547 + }, + { + "auxiliary_loss_clip": 0.01148322, + "auxiliary_loss_mlp": 0.01103243, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00053215, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.7476185714955186, + "language_loss": 0.82537842, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84789407, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 2.5259909629821777 + }, + { + "auxiliary_loss_clip": 0.01134689, + "auxiliary_loss_mlp": 0.00747243, + "balance_loss_clip": 1.00172198, + "balance_loss_mlp": 1.00036275, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.6258797823531483, + "language_loss": 0.83992237, + "learning_rate": 9.08771723625934e-08, + "loss": 0.85874176, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.588671922683716 + }, + { + "auxiliary_loss_clip": 0.01147981, + "auxiliary_loss_mlp": 0.0074731, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00035191, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.5300808261104575, + "language_loss": 0.65487629, + "learning_rate": 9.076114342030617e-08, + "loss": 0.6738292, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 2.681886672973633 + }, + { + "auxiliary_loss_clip": 0.01063541, + "auxiliary_loss_mlp": 0.01101931, + "balance_loss_clip": 1.00160491, + "balance_loss_mlp": 1.00026917, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.5935404225303262, + "language_loss": 0.70994133, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73159611, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 2.9355552196502686 + }, + { + "auxiliary_loss_clip": 0.01148774, + "auxiliary_loss_mlp": 0.01104926, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00049865, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.3652074464848756, + "language_loss": 0.71038646, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73292351, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 2.4989099502563477 + }, + { + "auxiliary_loss_clip": 0.01133762, + "auxiliary_loss_mlp": 0.01102665, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00052655, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 2.885502873714701, + "language_loss": 0.74575168, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76811594, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.5991079807281494 + }, + { + "auxiliary_loss_clip": 0.01114225, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00042367, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 1.7775561309049683, + "language_loss": 0.77956963, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80173278, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.647777795791626 + }, + { + "auxiliary_loss_clip": 0.01131442, + "auxiliary_loss_mlp": 0.00747272, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00036907, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.586812979685528, + "language_loss": 0.68794763, + "learning_rate": 9.01820847747028e-08, + "loss": 0.70673478, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 2.592956066131592 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01102854, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00042915, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 2.607635879061354, + "language_loss": 0.66797376, + "learning_rate": 9.006649028948965e-08, + "loss": 0.69065148, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.5393221378326416 + }, + { + "auxiliary_loss_clip": 0.01115585, + "auxiliary_loss_mlp": 0.01080067, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 0.99995786, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7821082667835263, + "language_loss": 0.61309087, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63504738, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.2089362144470215 + }, + { + "auxiliary_loss_clip": 0.01150244, + "auxiliary_loss_mlp": 0.0110302, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00059509, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 2.0631782082864993, + "language_loss": 0.72145414, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74398679, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 2.5500197410583496 + }, + { + "auxiliary_loss_clip": 0.01132949, + "auxiliary_loss_mlp": 0.01102394, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00035083, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 1.9267444743077249, + "language_loss": 0.7625705, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78492391, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 2.5481886863708496 + }, + { + "auxiliary_loss_clip": 0.01133907, + "auxiliary_loss_mlp": 0.01102605, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00037086, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 2.1525714201698416, + "language_loss": 0.73452657, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75689173, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 2.5853419303894043 + }, + { + "auxiliary_loss_clip": 0.01164818, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_clip": 1.00186455, + "balance_loss_mlp": 1.00046062, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 2.400505211706251, + "language_loss": 0.75684142, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77951175, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 2.5218665599823 + }, + { + "auxiliary_loss_clip": 0.01129397, + "auxiliary_loss_mlp": 0.01103529, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00053215, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.1522745064872755, + "language_loss": 0.77857471, + "learning_rate": 8.93744444537079e-08, + "loss": 0.80090398, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.568902015686035 + }, + { + "auxiliary_loss_clip": 0.01133292, + "auxiliary_loss_mlp": 0.01100568, + "balance_loss_clip": 1.00170243, + "balance_loss_mlp": 1.00033641, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 3.353845058356763, + "language_loss": 0.86148143, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88382, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 4.234505653381348 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01103133, + "balance_loss_clip": 1.00177681, + "balance_loss_mlp": 1.00042188, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.3903496801849884, + "language_loss": 0.78772932, + "learning_rate": 8.914434207073296e-08, + "loss": 0.81007087, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.620187997817993 + }, + { + "auxiliary_loss_clip": 0.01145886, + "auxiliary_loss_mlp": 0.01079395, + "balance_loss_clip": 1.00116897, + "balance_loss_mlp": 1.00004947, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7326440326873918, + "language_loss": 0.57038462, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59263742, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 3.060547113418579 + }, + { + "auxiliary_loss_clip": 0.01150356, + "auxiliary_loss_mlp": 0.01104075, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00050569, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 2.052184868351123, + "language_loss": 0.7138617, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73640597, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 2.541754722595215 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_clip": 1.00190222, + "balance_loss_mlp": 1.0004791, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.6941579677832943, + "language_loss": 0.73874015, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76095909, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.5876855850219727 + }, + { + "auxiliary_loss_clip": 0.01165195, + "auxiliary_loss_mlp": 0.01103275, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00046885, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 1.7231004141266324, + "language_loss": 0.56635863, + "learning_rate": 8.868500685768898e-08, + "loss": 0.58904332, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 2.565171480178833 + }, + { + "auxiliary_loss_clip": 0.01145467, + "auxiliary_loss_mlp": 0.01102338, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.00029445, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.525130090785859, + "language_loss": 0.79861081, + "learning_rate": 8.857035423668935e-08, + "loss": 0.82108879, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.547295570373535 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.00747399, + "balance_loss_clip": 1.00161743, + "balance_loss_mlp": 1.00039887, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 2.161779037456172, + "language_loss": 0.66256011, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68103838, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.6969337463378906 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01103148, + "balance_loss_clip": 1.00174773, + "balance_loss_mlp": 1.00062764, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 1.913348705603124, + "language_loss": 0.70304501, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72541177, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.5632314682006836 + }, + { + "auxiliary_loss_clip": 0.01143874, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_clip": 1.00112402, + "balance_loss_mlp": 0.99999887, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6199275445601462, + "language_loss": 0.53468329, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55691546, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 3.1811749935150146 + }, + { + "auxiliary_loss_clip": 0.01118768, + "auxiliary_loss_mlp": 0.01103233, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00042701, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.6266368130573663, + "language_loss": 0.68344569, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70566571, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.6304798126220703 + }, + { + "auxiliary_loss_clip": 0.01150339, + "auxiliary_loss_mlp": 0.01102254, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00049686, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.9046883497277107, + "language_loss": 0.79323393, + "learning_rate": 8.799817844260049e-08, + "loss": 0.8157599, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 2.5442657470703125 + }, + { + "auxiliary_loss_clip": 0.0113385, + "auxiliary_loss_mlp": 0.0110221, + "balance_loss_clip": 1.0018574, + "balance_loss_mlp": 1.000453, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.694125458724813, + "language_loss": 0.71413255, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73649311, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 4.11736273765564 + }, + { + "auxiliary_loss_clip": 0.0113333, + "auxiliary_loss_mlp": 0.01102249, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00039613, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.9580121697718964, + "language_loss": 0.7732898, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79564553, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.619922161102295 + }, + { + "auxiliary_loss_clip": 0.01164812, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.0017612, + "balance_loss_mlp": 1.00043654, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.8470848288168882, + "language_loss": 0.7449106, + "learning_rate": 8.765574297104628e-08, + "loss": 0.76403284, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 3.903878688812256 + }, + { + "auxiliary_loss_clip": 0.0110449, + "auxiliary_loss_mlp": 0.01103999, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00043046, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.6520482994576107, + "language_loss": 0.80327415, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82535905, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.6695163249969482 + }, + { + "auxiliary_loss_clip": 0.01127352, + "auxiliary_loss_mlp": 0.01079405, + "balance_loss_clip": 1.00149536, + "balance_loss_mlp": 1.00005901, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8155932488217326, + "language_loss": 0.59682941, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61889702, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 3.1275246143341064 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.00168896, + "balance_loss_mlp": 1.00030565, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.8447167930158739, + "language_loss": 0.73621786, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75857604, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.6743264198303223 + }, + { + "auxiliary_loss_clip": 0.01116765, + "auxiliary_loss_mlp": 0.01101945, + "balance_loss_clip": 1.00161839, + "balance_loss_mlp": 1.00037837, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 1.8866444344495794, + "language_loss": 0.71661764, + "learning_rate": 8.720017759045073e-08, + "loss": 0.7388047, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 2.6410953998565674 + }, + { + "auxiliary_loss_clip": 0.01135361, + "auxiliary_loss_mlp": 0.01103058, + "balance_loss_clip": 1.00170898, + "balance_loss_mlp": 1.00053835, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.7393631426588765, + "language_loss": 0.68973726, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71212149, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 4.032761812210083 + }, + { + "auxiliary_loss_clip": 0.01122176, + "auxiliary_loss_mlp": 0.01079383, + "balance_loss_clip": 1.00108361, + "balance_loss_mlp": 1.00003779, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6981977325287808, + "language_loss": 0.51723844, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53925407, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 3.189647912979126 + }, + { + "auxiliary_loss_clip": 0.011497, + "auxiliary_loss_mlp": 0.01102836, + "balance_loss_clip": 1.00170434, + "balance_loss_mlp": 1.0006026, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 1.687203978811028, + "language_loss": 0.69950807, + "learning_rate": 8.685926514226837e-08, + "loss": 0.7220335, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.5263712406158447 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01102371, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00051844, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 1.8882648333682406, + "language_loss": 0.79521644, + "learning_rate": 8.674577274677508e-08, + "loss": 0.8177225, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.627058506011963 + }, + { + "auxiliary_loss_clip": 0.01099792, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_clip": 1.00163507, + "balance_loss_mlp": 1.00044346, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 2.136492403345387, + "language_loss": 0.70217395, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72420532, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 2.6557955741882324 + }, + { + "auxiliary_loss_clip": 0.01121345, + "auxiliary_loss_mlp": 0.01103826, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00035286, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.9139375301918236, + "language_loss": 0.65633959, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67859131, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.6272482872009277 + }, + { + "auxiliary_loss_clip": 0.01164866, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00054097, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 1.6351168244942025, + "language_loss": 0.69108003, + "learning_rate": 8.640573088224812e-08, + "loss": 0.7137574, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.519129514694214 + }, + { + "auxiliary_loss_clip": 0.01114471, + "auxiliary_loss_mlp": 0.01102818, + "balance_loss_clip": 1.00153923, + "balance_loss_mlp": 1.00039387, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.4122591262045026, + "language_loss": 0.74375188, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76592481, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 2.6657228469848633 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.01104531, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00048518, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 1.902849672374429, + "language_loss": 0.72884136, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75122672, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.5708882808685303 + }, + { + "auxiliary_loss_clip": 0.01117167, + "auxiliary_loss_mlp": 0.01105001, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00047839, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.6856853751667025, + "language_loss": 0.71827412, + "learning_rate": 8.60663420908827e-08, + "loss": 0.74049586, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.5797817707061768 + }, + { + "auxiliary_loss_clip": 0.01165014, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00043797, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 1.9982764722925923, + "language_loss": 0.65481293, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67393672, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.5333335399627686 + }, + { + "auxiliary_loss_clip": 0.0114975, + "auxiliary_loss_mlp": 0.01103552, + "balance_loss_clip": 1.00176966, + "balance_loss_mlp": 1.00055575, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.790688607694084, + "language_loss": 0.70124483, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72377789, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.8080883026123047 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01102108, + "balance_loss_clip": 1.00156951, + "balance_loss_mlp": 1.00035119, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.2280688700939182, + "language_loss": 0.74697828, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76902568, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 2.6728227138519287 + }, + { + "auxiliary_loss_clip": 0.01147997, + "auxiliary_loss_mlp": 0.0110182, + "balance_loss_clip": 1.0017091, + "balance_loss_mlp": 1.00034881, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 2.152042494793062, + "language_loss": 0.75739872, + "learning_rate": 8.561483979414253e-08, + "loss": 0.77989686, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.6013545989990234 + }, + { + "auxiliary_loss_clip": 0.01150512, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00047469, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 5.570023021255408, + "language_loss": 0.72399849, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74653357, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.5360283851623535 + }, + { + "auxiliary_loss_clip": 0.01117335, + "auxiliary_loss_mlp": 0.01103284, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.0005734, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.6224383411334145, + "language_loss": 0.79034781, + "learning_rate": 8.538952419072143e-08, + "loss": 0.812554, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 2.5831985473632812 + }, + { + "auxiliary_loss_clip": 0.0111438, + "auxiliary_loss_mlp": 0.01103467, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00056553, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.5813241721893156, + "language_loss": 0.75250536, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77468383, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.631629467010498 + }, + { + "auxiliary_loss_clip": 0.01064277, + "auxiliary_loss_mlp": 0.01102532, + "balance_loss_clip": 1.00168002, + "balance_loss_mlp": 1.00048923, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.8646433726199327, + "language_loss": 0.62136573, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64303386, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 2.8427486419677734 + }, + { + "auxiliary_loss_clip": 0.01116762, + "auxiliary_loss_mlp": 0.01101432, + "balance_loss_clip": 1.00161266, + "balance_loss_mlp": 1.00034297, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.677260230418012, + "language_loss": 0.76721191, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78939378, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 2.6510684490203857 + }, + { + "auxiliary_loss_clip": 0.01150262, + "auxiliary_loss_mlp": 0.01102974, + "balance_loss_clip": 1.0018487, + "balance_loss_mlp": 1.00045419, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 2.230827740985814, + "language_loss": 0.83480716, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85733956, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 4.063809394836426 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01103382, + "balance_loss_clip": 1.00187743, + "balance_loss_mlp": 1.00038552, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.625888578955458, + "language_loss": 0.75139767, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77376223, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 2.787511110305786 + }, + { + "auxiliary_loss_clip": 0.01133999, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00048041, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 2.093250372251498, + "language_loss": 0.59446394, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61682916, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.67641019821167 + }, + { + "auxiliary_loss_clip": 0.01112258, + "auxiliary_loss_mlp": 0.01103223, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00051188, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.3890574793937818, + "language_loss": 0.82575977, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84791458, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 2.6262879371643066 + }, + { + "auxiliary_loss_clip": 0.01133801, + "auxiliary_loss_mlp": 0.01103007, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00039172, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.8192500978980606, + "language_loss": 0.74018598, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76255417, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 2.6113064289093018 + }, + { + "auxiliary_loss_clip": 0.0111929, + "auxiliary_loss_mlp": 0.01103895, + "balance_loss_clip": 1.00163865, + "balance_loss_mlp": 1.00042152, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.4162817760498485, + "language_loss": 0.72664648, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74887836, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.6347830295562744 + }, + { + "auxiliary_loss_clip": 0.01148245, + "auxiliary_loss_mlp": 0.01102337, + "balance_loss_clip": 1.00193441, + "balance_loss_mlp": 1.00048494, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.573833841288716, + "language_loss": 0.6965009, + "learning_rate": 8.426730298881702e-08, + "loss": 0.71900672, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 2.548588514328003 + }, + { + "auxiliary_loss_clip": 0.0110871, + "auxiliary_loss_mlp": 0.01079751, + "balance_loss_clip": 1.00106347, + "balance_loss_mlp": 1.00002432, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8213307834673178, + "language_loss": 0.59296221, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61484683, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 2.9637441635131836 + }, + { + "auxiliary_loss_clip": 0.01150309, + "auxiliary_loss_mlp": 0.01101926, + "balance_loss_clip": 1.00183666, + "balance_loss_mlp": 1.00045478, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 2.069684914828123, + "language_loss": 0.82256448, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84508681, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.550039529800415 + }, + { + "auxiliary_loss_clip": 0.01148064, + "auxiliary_loss_mlp": 0.01102327, + "balance_loss_clip": 1.00188982, + "balance_loss_mlp": 1.00037885, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.4439997309744277, + "language_loss": 0.81303263, + "learning_rate": 8.39320530080283e-08, + "loss": 0.8355366, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.58404541015625 + }, + { + "auxiliary_loss_clip": 0.01114632, + "auxiliary_loss_mlp": 0.01101782, + "balance_loss_clip": 1.00158882, + "balance_loss_mlp": 1.00050199, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.5850198528723896, + "language_loss": 0.775608, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79777217, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 2.6275887489318848 + }, + { + "auxiliary_loss_clip": 0.01164879, + "auxiliary_loss_mlp": 0.01102366, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00041771, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.5109655396481239, + "language_loss": 0.66458917, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68726158, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 2.6630213260650635 + }, + { + "auxiliary_loss_clip": 0.01147859, + "auxiliary_loss_mlp": 0.01102652, + "balance_loss_clip": 1.00178289, + "balance_loss_mlp": 1.00051332, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 2.594413460157638, + "language_loss": 0.74736971, + "learning_rate": 8.359745694462005e-08, + "loss": 0.76987481, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 3.9612507820129395 + }, + { + "auxiliary_loss_clip": 0.01118091, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.00163412, + "balance_loss_mlp": 1.00055242, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.6589113129071704, + "language_loss": 0.64611822, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66832125, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 2.629648208618164 + }, + { + "auxiliary_loss_clip": 0.01164886, + "auxiliary_loss_mlp": 0.0110329, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00057912, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 1.8299176774696548, + "language_loss": 0.61009306, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63277483, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 4.017813205718994 + }, + { + "auxiliary_loss_clip": 0.01118536, + "auxiliary_loss_mlp": 0.01101691, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00041056, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.6018757679146975, + "language_loss": 0.71005726, + "learning_rate": 8.326351491278382e-08, + "loss": 0.73225951, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.6358697414398193 + }, + { + "auxiliary_loss_clip": 0.0109764, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.00162601, + "balance_loss_mlp": 1.00038457, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.540756126257507, + "language_loss": 0.7084505, + "learning_rate": 8.315234626222545e-08, + "loss": 0.73043877, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.7009503841400146 + }, + { + "auxiliary_loss_clip": 0.01133373, + "auxiliary_loss_mlp": 0.01102425, + "balance_loss_clip": 1.00165188, + "balance_loss_mlp": 1.00047708, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 3.234576599584107, + "language_loss": 0.7321313, + "learning_rate": 8.304125029872233e-08, + "loss": 0.7544893, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.6052002906799316 + }, + { + "auxiliary_loss_clip": 0.01116379, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_clip": 1.00159037, + "balance_loss_mlp": 1.00049591, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.8439528722905956, + "language_loss": 0.79992992, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82211995, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 3.9827582836151123 + }, + { + "auxiliary_loss_clip": 0.01114763, + "auxiliary_loss_mlp": 0.01102631, + "balance_loss_clip": 1.00153828, + "balance_loss_mlp": 1.00058794, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 1.9523548495576497, + "language_loss": 0.67993975, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70211363, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 2.6251943111419678 + }, + { + "auxiliary_loss_clip": 0.01164987, + "auxiliary_loss_mlp": 0.01101908, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00038886, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.6083176450108942, + "language_loss": 0.6332376, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65590656, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 2.533900260925293 + }, + { + "auxiliary_loss_clip": 0.01112172, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00053859, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 1.7770834364186505, + "language_loss": 0.72237235, + "learning_rate": 8.259759339947514e-08, + "loss": 0.7445237, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 2.590456962585449 + }, + { + "auxiliary_loss_clip": 0.01148139, + "auxiliary_loss_mlp": 0.01102548, + "balance_loss_clip": 1.00176656, + "balance_loss_mlp": 1.00040984, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.5325721641014578, + "language_loss": 0.6430403, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66554719, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 2.586252450942993 + }, + { + "auxiliary_loss_clip": 0.01133568, + "auxiliary_loss_mlp": 0.00747391, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.00045419, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 2.1680127793560207, + "language_loss": 0.73198771, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75079727, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.592076063156128 + }, + { + "auxiliary_loss_clip": 0.01120926, + "auxiliary_loss_mlp": 0.01104146, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00048137, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 2.5832351136879863, + "language_loss": 0.71995682, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74220753, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.587454319000244 + }, + { + "auxiliary_loss_clip": 0.01132945, + "auxiliary_loss_mlp": 0.01103242, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00053155, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 2.038424563913943, + "language_loss": 0.82337278, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84573472, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.5810024738311768 + }, + { + "auxiliary_loss_clip": 0.01148169, + "auxiliary_loss_mlp": 0.01102774, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00044465, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.533122264249677, + "language_loss": 0.59810734, + "learning_rate": 8.204465823887252e-08, + "loss": 0.62061679, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.5549204349517822 + }, + { + "auxiliary_loss_clip": 0.01148317, + "auxiliary_loss_mlp": 0.01103088, + "balance_loss_clip": 1.00168324, + "balance_loss_mlp": 1.0003773, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 2.06703581307166, + "language_loss": 0.7425065, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76502061, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.6497793197631836 + }, + { + "auxiliary_loss_clip": 0.0110403, + "auxiliary_loss_mlp": 0.011024, + "balance_loss_clip": 1.00165343, + "balance_loss_mlp": 1.00045192, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.5576788130490289, + "language_loss": 0.59651458, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61857885, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 2.761641502380371 + }, + { + "auxiliary_loss_clip": 0.01081957, + "auxiliary_loss_mlp": 0.01102324, + "balance_loss_clip": 1.00134873, + "balance_loss_mlp": 1.0005672, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.6067469184744865, + "language_loss": 0.67518103, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69702387, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 2.6821508407592773 + }, + { + "auxiliary_loss_clip": 0.01133554, + "auxiliary_loss_mlp": 0.01102395, + "balance_loss_clip": 1.00172746, + "balance_loss_mlp": 1.00035214, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 1.747998394536292, + "language_loss": 0.78318286, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80554235, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.6163454055786133 + }, + { + "auxiliary_loss_clip": 0.01165061, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.00038207, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6714108854903478, + "language_loss": 0.68982404, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71251035, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.464118003845215 + }, + { + "auxiliary_loss_clip": 0.01101406, + "auxiliary_loss_mlp": 0.01103106, + "balance_loss_clip": 1.00158036, + "balance_loss_mlp": 1.00039506, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.6669861536318584, + "language_loss": 0.76498258, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78702772, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 2.6608052253723145 + }, + { + "auxiliary_loss_clip": 0.01130318, + "auxiliary_loss_mlp": 0.01103767, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00038934, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.9588621742106358, + "language_loss": 0.67065823, + "learning_rate": 8.127360375135395e-08, + "loss": 0.69299906, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.5936129093170166 + }, + { + "auxiliary_loss_clip": 0.01100295, + "auxiliary_loss_mlp": 0.01103355, + "balance_loss_clip": 1.00170207, + "balance_loss_mlp": 1.00045359, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.1050910594206895, + "language_loss": 0.70439887, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72643536, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.62610125541687 + }, + { + "auxiliary_loss_clip": 0.0116489, + "auxiliary_loss_mlp": 0.01102132, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00047088, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.5552997280591854, + "language_loss": 0.75536937, + "learning_rate": 8.105395723129315e-08, + "loss": 0.77803957, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.507906198501587 + }, + { + "auxiliary_loss_clip": 0.01150592, + "auxiliary_loss_mlp": 0.01103782, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00059474, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.0444801290145254, + "language_loss": 0.71955121, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74209499, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 2.564685344696045 + }, + { + "auxiliary_loss_clip": 0.01097069, + "auxiliary_loss_mlp": 0.0110337, + "balance_loss_clip": 1.00168085, + "balance_loss_mlp": 1.00046849, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.7965765388525052, + "language_loss": 0.72903156, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75103593, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 2.6374011039733887 + }, + { + "auxiliary_loss_clip": 0.01131004, + "auxiliary_loss_mlp": 0.01079391, + "balance_loss_clip": 1.00111914, + "balance_loss_mlp": 1.00004542, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.773052922521228, + "language_loss": 0.65620816, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67831218, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 4.530690908432007 + }, + { + "auxiliary_loss_clip": 0.01131677, + "auxiliary_loss_mlp": 0.01102894, + "balance_loss_clip": 1.00160742, + "balance_loss_mlp": 1.00046897, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 1.895659909735005, + "language_loss": 0.7810297, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80337536, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 2.5523931980133057 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01102844, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00041938, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 2.6056724308086325, + "language_loss": 0.82037985, + "learning_rate": 8.05061144198591e-08, + "loss": 0.8428911, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.5368094444274902 + }, + { + "auxiliary_loss_clip": 0.01149451, + "auxiliary_loss_mlp": 0.01103557, + "balance_loss_clip": 1.00189769, + "balance_loss_mlp": 1.00036955, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 2.428291618772815, + "language_loss": 0.77319235, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79572242, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 2.496830463409424 + }, + { + "auxiliary_loss_clip": 0.01064309, + "auxiliary_loss_mlp": 0.01102375, + "balance_loss_clip": 1.00144386, + "balance_loss_mlp": 1.00052285, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.332124336980744, + "language_loss": 0.67016292, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69182974, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 2.717864990234375 + }, + { + "auxiliary_loss_clip": 0.01134004, + "auxiliary_loss_mlp": 0.01103153, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00044179, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.6973188079239152, + "language_loss": 0.7521292, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77450073, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.622842311859131 + }, + { + "auxiliary_loss_clip": 0.01132874, + "auxiliary_loss_mlp": 0.01104149, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00048423, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.204441298300592, + "language_loss": 0.65565497, + "learning_rate": 8.00691503189499e-08, + "loss": 0.67802519, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 2.5332257747650146 + }, + { + "auxiliary_loss_clip": 0.01150334, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_clip": 1.0018543, + "balance_loss_mlp": 1.00054789, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.9841765189568952, + "language_loss": 0.74997336, + "learning_rate": 7.996009129329894e-08, + "loss": 0.77250922, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 2.5741379261016846 + }, + { + "auxiliary_loss_clip": 0.01143768, + "auxiliary_loss_mlp": 0.01079373, + "balance_loss_clip": 1.00116086, + "balance_loss_mlp": 1.0000273, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 1.3533450421439048, + "language_loss": 0.58451056, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60674191, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 3.1721956729888916 + }, + { + "auxiliary_loss_clip": 0.01135827, + "auxiliary_loss_mlp": 0.01103576, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00057888, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.6745888739366637, + "language_loss": 0.65129578, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67368984, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.5911567211151123 + }, + { + "auxiliary_loss_clip": 0.01133959, + "auxiliary_loss_mlp": 0.01102879, + "balance_loss_clip": 1.00195515, + "balance_loss_mlp": 1.00045419, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 2.3248031202607895, + "language_loss": 0.81012434, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83249271, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.540449619293213 + }, + { + "auxiliary_loss_clip": 0.01086841, + "auxiliary_loss_mlp": 0.01102346, + "balance_loss_clip": 1.0015676, + "balance_loss_mlp": 1.00039816, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 2.083244702800201, + "language_loss": 0.78762537, + "learning_rate": 7.952458331306711e-08, + "loss": 0.80951726, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 2.6730093955993652 + }, + { + "auxiliary_loss_clip": 0.01131353, + "auxiliary_loss_mlp": 0.01102447, + "balance_loss_clip": 1.00161994, + "balance_loss_mlp": 1.00049961, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 2.4863434449759243, + "language_loss": 0.67764336, + "learning_rate": 7.941588836924507e-08, + "loss": 0.69998139, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 4.018579006195068 + }, + { + "auxiliary_loss_clip": 0.01150137, + "auxiliary_loss_mlp": 0.01102337, + "balance_loss_clip": 1.00173938, + "balance_loss_mlp": 1.0003897, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.7345777556561839, + "language_loss": 0.74864757, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77117229, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 3.9208226203918457 + }, + { + "auxiliary_loss_clip": 0.01165137, + "auxiliary_loss_mlp": 0.01102745, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00051153, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 1.6602653453758247, + "language_loss": 0.74708146, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76976025, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.4950921535491943 + }, + { + "auxiliary_loss_clip": 0.01165125, + "auxiliary_loss_mlp": 0.01103274, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00046754, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.591194338951212, + "language_loss": 0.76271605, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78539997, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.5330111980438232 + }, + { + "auxiliary_loss_clip": 0.01148641, + "auxiliary_loss_mlp": 0.01103696, + "balance_loss_clip": 1.00176895, + "balance_loss_mlp": 1.0004127, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.1701555076291696, + "language_loss": 0.76573926, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78826261, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 2.4922266006469727 + }, + { + "auxiliary_loss_clip": 0.01145748, + "auxiliary_loss_mlp": 0.01102823, + "balance_loss_clip": 1.00201607, + "balance_loss_mlp": 1.00049424, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.6767273497687571, + "language_loss": 0.74319935, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76568508, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 3.8809053897857666 + }, + { + "auxiliary_loss_clip": 0.01131465, + "auxiliary_loss_mlp": 0.01102128, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.0003705, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 1.8634668831485717, + "language_loss": 0.68660939, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70894533, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.55645489692688 + }, + { + "auxiliary_loss_clip": 0.0113361, + "auxiliary_loss_mlp": 0.01104138, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 1.00047326, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 3.2480890627320202, + "language_loss": 0.77543414, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79781163, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 2.5547618865966797 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.00747495, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00041378, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 2.0752579393732695, + "language_loss": 0.65864611, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67777163, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 2.5323970317840576 + }, + { + "auxiliary_loss_clip": 0.01068158, + "auxiliary_loss_mlp": 0.01101631, + "balance_loss_clip": 1.00142717, + "balance_loss_mlp": 1.00035083, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.7838164591584693, + "language_loss": 0.76222819, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78392613, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 2.698086977005005 + }, + { + "auxiliary_loss_clip": 0.01150227, + "auxiliary_loss_mlp": 0.01101362, + "balance_loss_clip": 1.00184977, + "balance_loss_mlp": 1.00046349, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.653641730986324, + "language_loss": 0.75294805, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77546394, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 2.539120674133301 + }, + { + "auxiliary_loss_clip": 0.01144437, + "auxiliary_loss_mlp": 0.01079443, + "balance_loss_clip": 1.0011791, + "balance_loss_mlp": 1.00009775, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.714579968488335, + "language_loss": 0.57350677, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59574562, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.1642212867736816 + }, + { + "auxiliary_loss_clip": 0.01165082, + "auxiliary_loss_mlp": 0.01102783, + "balance_loss_clip": 1.00188792, + "balance_loss_mlp": 1.00054932, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.6823113820116662, + "language_loss": 0.74424314, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76692188, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.529541015625 + }, + { + "auxiliary_loss_clip": 0.01148175, + "auxiliary_loss_mlp": 0.01102255, + "balance_loss_clip": 1.0017935, + "balance_loss_mlp": 1.00040269, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.5469393480686664, + "language_loss": 0.69108504, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71358931, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.6880886554718018 + }, + { + "auxiliary_loss_clip": 0.01148248, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00062764, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 5.777457824766476, + "language_loss": 0.73251665, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75502491, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.5414414405822754 + }, + { + "auxiliary_loss_clip": 0.01101159, + "auxiliary_loss_mlp": 0.01104042, + "balance_loss_clip": 1.00161743, + "balance_loss_mlp": 1.00047314, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 2.246058594973931, + "language_loss": 0.61665875, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63871074, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.652909755706787 + }, + { + "auxiliary_loss_clip": 0.01150271, + "auxiliary_loss_mlp": 0.01103376, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00047493, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.4899794724960422, + "language_loss": 0.71419877, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73673522, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.580310821533203 + }, + { + "auxiliary_loss_clip": 0.01133401, + "auxiliary_loss_mlp": 0.01102227, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.0004704, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.5283500481765595, + "language_loss": 0.71108294, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73343927, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.6204915046691895 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.01102246, + "balance_loss_clip": 1.00162399, + "balance_loss_mlp": 1.00034547, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.7953439747136297, + "language_loss": 0.77815229, + "learning_rate": 7.747183707589489e-08, + "loss": 0.80050516, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.01148327, + "auxiliary_loss_mlp": 0.01102727, + "balance_loss_clip": 1.00165522, + "balance_loss_mlp": 1.00049329, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.4937768890438463, + "language_loss": 0.67874807, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70125854, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.5491976737976074 + }, + { + "auxiliary_loss_clip": 0.01149844, + "auxiliary_loss_mlp": 0.00747308, + "balance_loss_clip": 1.00183737, + "balance_loss_mlp": 1.00037622, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.5017493264077717, + "language_loss": 0.67740119, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69637263, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.61651349067688 + }, + { + "auxiliary_loss_clip": 0.01147803, + "auxiliary_loss_mlp": 0.01102417, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00056422, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.827324810131206, + "language_loss": 0.71417058, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73667276, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 2.5445189476013184 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01101553, + "balance_loss_clip": 1.00174022, + "balance_loss_mlp": 1.00036812, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.7430260612939958, + "language_loss": 0.70381671, + "learning_rate": 7.704303254710165e-08, + "loss": 0.72631383, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.495419502258301 + }, + { + "auxiliary_loss_clip": 0.01164871, + "auxiliary_loss_mlp": 0.01103038, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00051856, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.8760551890149544, + "language_loss": 0.66377878, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68645787, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.467595338821411 + }, + { + "auxiliary_loss_clip": 0.01149839, + "auxiliary_loss_mlp": 0.01103214, + "balance_loss_clip": 1.00191259, + "balance_loss_mlp": 1.00059843, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.418355754255177, + "language_loss": 0.68514127, + "learning_rate": 7.682906777877751e-08, + "loss": 0.70767182, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.55324387550354 + }, + { + "auxiliary_loss_clip": 0.01149702, + "auxiliary_loss_mlp": 0.01102962, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.0003469, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 2.0189705860038023, + "language_loss": 0.60095286, + "learning_rate": 7.672219478283915e-08, + "loss": 0.62347949, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 2.555314302444458 + }, + { + "auxiliary_loss_clip": 0.01113788, + "auxiliary_loss_mlp": 0.01102787, + "balance_loss_clip": 1.00182927, + "balance_loss_mlp": 1.00045764, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.5522501003739049, + "language_loss": 0.8109265, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83309221, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 4.068102121353149 + }, + { + "auxiliary_loss_clip": 0.01100683, + "auxiliary_loss_mlp": 0.01102959, + "balance_loss_clip": 1.00164318, + "balance_loss_mlp": 1.00034356, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.3710714007287246, + "language_loss": 0.73908234, + "learning_rate": 7.650866758767382e-08, + "loss": 0.76111871, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 2.6340982913970947 + }, + { + "auxiliary_loss_clip": 0.0109961, + "auxiliary_loss_mlp": 0.01103093, + "balance_loss_clip": 1.00164557, + "balance_loss_mlp": 1.00057268, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.675696906733597, + "language_loss": 0.72636145, + "learning_rate": 7.640201339654373e-08, + "loss": 0.74838847, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 2.664822816848755 + }, + { + "auxiliary_loss_clip": 0.01145721, + "auxiliary_loss_mlp": 0.01103093, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00038254, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.0754468332381033, + "language_loss": 0.86386228, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88635039, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.5020275115966797 + }, + { + "auxiliary_loss_clip": 0.01133005, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00068092, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.5955043459643183, + "language_loss": 0.75504649, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77739894, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.597747802734375 + }, + { + "auxiliary_loss_clip": 0.01133293, + "auxiliary_loss_mlp": 0.01102619, + "balance_loss_clip": 1.00157714, + "balance_loss_mlp": 1.00038528, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.86151669537701, + "language_loss": 0.77658224, + "learning_rate": 7.6082488497488e-08, + "loss": 0.79894137, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.600564479827881 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0110256, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.0004214, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.8833565787284694, + "language_loss": 0.8273946, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84991944, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.547913074493408 + }, + { + "auxiliary_loss_clip": 0.01148557, + "auxiliary_loss_mlp": 0.0110175, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.00037456, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.824444907806101, + "language_loss": 0.83940363, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86190677, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.5118248462677 + }, + { + "auxiliary_loss_clip": 0.01145534, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00043201, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.657568622955984, + "language_loss": 0.71075559, + "learning_rate": 7.576362019471894e-08, + "loss": 0.73323756, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 2.525967597961426 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01104437, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00058162, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.50804875358808, + "language_loss": 0.62787163, + "learning_rate": 7.565747668956413e-08, + "loss": 0.65039957, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.5690183639526367 + }, + { + "auxiliary_loss_clip": 0.0111835, + "auxiliary_loss_mlp": 0.01103662, + "balance_loss_clip": 1.00157654, + "balance_loss_mlp": 1.00047457, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.2794321791361907, + "language_loss": 0.76529205, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78751212, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.586228847503662 + }, + { + "auxiliary_loss_clip": 0.01133349, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.00046813, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.226575374573666, + "language_loss": 0.67524302, + "learning_rate": 7.544540859706062e-08, + "loss": 0.69760162, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 2.573317050933838 + }, + { + "auxiliary_loss_clip": 0.01148339, + "auxiliary_loss_mlp": 0.01102542, + "balance_loss_clip": 1.00192964, + "balance_loss_mlp": 1.00040388, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.8450653218283686, + "language_loss": 0.80136979, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82387859, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 3.91359281539917 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01078966, + "balance_loss_clip": 1.00118923, + "balance_loss_mlp": 1.00000179, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8421124683406302, + "language_loss": 0.59270287, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61459547, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 4.535735368728638 + }, + { + "auxiliary_loss_clip": 0.01150106, + "auxiliary_loss_mlp": 0.01102471, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.0006181, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 1.7928430515357419, + "language_loss": 0.78993624, + "learning_rate": 7.512785381311216e-08, + "loss": 0.81246197, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 2.5218358039855957 + }, + { + "auxiliary_loss_clip": 0.0110024, + "auxiliary_loss_mlp": 0.01103581, + "balance_loss_clip": 1.00163543, + "balance_loss_mlp": 1.00039363, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 1.9826371250731143, + "language_loss": 0.66105074, + "learning_rate": 7.50221481958031e-08, + "loss": 0.6830889, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.6160545349121094 + }, + { + "auxiliary_loss_clip": 0.01131875, + "auxiliary_loss_mlp": 0.01102876, + "balance_loss_clip": 1.0016464, + "balance_loss_mlp": 1.00054693, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.6300255452044452, + "language_loss": 0.84372848, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86607599, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.56316876411438 + }, + { + "auxiliary_loss_clip": 0.01129201, + "auxiliary_loss_mlp": 0.01079453, + "balance_loss_clip": 1.00123668, + "balance_loss_mlp": 1.00010705, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.8274790059331064, + "language_loss": 0.49621052, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51829708, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 4.632168292999268 + }, + { + "auxiliary_loss_clip": 0.011152, + "auxiliary_loss_mlp": 0.01102697, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.00065339, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 1.6802521046331196, + "language_loss": 0.72318733, + "learning_rate": 7.470546933201349e-08, + "loss": 0.74536633, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.604321002960205 + }, + { + "auxiliary_loss_clip": 0.01148106, + "auxiliary_loss_mlp": 0.01102646, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00031686, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.886996015162526, + "language_loss": 0.81054509, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83305258, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 2.5402345657348633 + }, + { + "auxiliary_loss_clip": 0.01165076, + "auxiliary_loss_mlp": 0.01103266, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00026917, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.2993992121223126, + "language_loss": 0.71099007, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73367345, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.5899338722229004 + }, + { + "auxiliary_loss_clip": 0.01067032, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.00163877, + "balance_loss_mlp": 1.00044656, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.8087182412817318, + "language_loss": 0.74836725, + "learning_rate": 7.43894475344613e-08, + "loss": 0.77007198, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 2.765825033187866 + }, + { + "auxiliary_loss_clip": 0.01133338, + "auxiliary_loss_mlp": 0.01102449, + "balance_loss_clip": 1.00175238, + "balance_loss_mlp": 1.00050139, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4109480099085132, + "language_loss": 0.7410444, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76340222, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 2.608306646347046 + }, + { + "auxiliary_loss_clip": 0.01117133, + "auxiliary_loss_mlp": 0.01101816, + "balance_loss_clip": 1.00153875, + "balance_loss_mlp": 1.00043988, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.5580999911095088, + "language_loss": 0.72051704, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74270654, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 2.617889881134033 + }, + { + "auxiliary_loss_clip": 0.0116502, + "auxiliary_loss_mlp": 0.01102836, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00050664, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 1.447110898189105, + "language_loss": 0.82912838, + "learning_rate": 7.407408291099848e-08, + "loss": 0.851807, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.4893460273742676 + }, + { + "auxiliary_loss_clip": 0.01103574, + "auxiliary_loss_mlp": 0.0110195, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.00047898, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.7046987554332997, + "language_loss": 0.83818245, + "learning_rate": 7.396910742713957e-08, + "loss": 0.86023772, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.691396474838257 + }, + { + "auxiliary_loss_clip": 0.01150037, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_clip": 1.00170314, + "balance_loss_mlp": 1.00027955, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.4251609270513408, + "language_loss": 0.72527897, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74780542, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.5792627334594727 + }, + { + "auxiliary_loss_clip": 0.01164963, + "auxiliary_loss_mlp": 0.01102861, + "balance_loss_clip": 1.00184214, + "balance_loss_mlp": 1.00043666, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.8932526513228436, + "language_loss": 0.67219019, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69486845, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.4677395820617676 + }, + { + "auxiliary_loss_clip": 0.01132094, + "auxiliary_loss_mlp": 0.01104068, + "balance_loss_clip": 1.00190115, + "balance_loss_mlp": 1.00049853, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 2.3097160861287924, + "language_loss": 0.69544852, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71781009, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 2.5747251510620117 + }, + { + "auxiliary_loss_clip": 0.01131554, + "auxiliary_loss_mlp": 0.01103705, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00042236, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.602255186279799, + "language_loss": 0.87991321, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90226573, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.5908656120300293 + }, + { + "auxiliary_loss_clip": 0.01087771, + "auxiliary_loss_mlp": 0.0110404, + "balance_loss_clip": 1.00177062, + "balance_loss_mlp": 1.00056648, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.5431656152395594, + "language_loss": 0.76894087, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79085898, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 2.7543885707855225 + }, + { + "auxiliary_loss_clip": 0.01083118, + "auxiliary_loss_mlp": 0.01079734, + "balance_loss_clip": 1.001122, + "balance_loss_mlp": 1.00000715, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6707967303032366, + "language_loss": 0.62230909, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64393759, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 3.2821946144104004 + }, + { + "auxiliary_loss_clip": 0.01165083, + "auxiliary_loss_mlp": 0.00747302, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00050044, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 1.8975510531706756, + "language_loss": 0.74740165, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76652551, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 2.6589202880859375 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00043201, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.6908161633412926, + "language_loss": 0.74848491, + "learning_rate": 7.313193316030464e-08, + "loss": 0.77116227, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.548814058303833 + }, + { + "auxiliary_loss_clip": 0.01117871, + "auxiliary_loss_mlp": 0.01103084, + "balance_loss_clip": 1.00164282, + "balance_loss_mlp": 1.00046861, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.2200764126292407, + "language_loss": 0.63413131, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65634084, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.628255605697632 + }, + { + "auxiliary_loss_clip": 0.01133585, + "auxiliary_loss_mlp": 0.00747194, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00031996, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.9064518490693216, + "language_loss": 0.76641172, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78521955, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 2.59191632270813 + }, + { + "auxiliary_loss_clip": 0.01148107, + "auxiliary_loss_mlp": 0.01105016, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00039816, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.027491853690009, + "language_loss": 0.67376244, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69629371, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 2.533015727996826 + }, + { + "auxiliary_loss_clip": 0.01150351, + "auxiliary_loss_mlp": 0.01102913, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00048828, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.7064847681802415, + "language_loss": 0.80549973, + "learning_rate": 7.271509950872334e-08, + "loss": 0.82803237, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.5023131370544434 + }, + { + "auxiliary_loss_clip": 0.01135166, + "auxiliary_loss_mlp": 0.01103665, + "balance_loss_clip": 1.00180757, + "balance_loss_mlp": 1.00047791, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.8397156826219676, + "language_loss": 0.82335901, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84574735, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 3.9639086723327637 + }, + { + "auxiliary_loss_clip": 0.01165047, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00064135, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.369422573785079, + "language_loss": 0.72403502, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74672568, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 2.4884397983551025 + }, + { + "auxiliary_loss_clip": 0.01130947, + "auxiliary_loss_mlp": 0.01101925, + "balance_loss_clip": 1.00165796, + "balance_loss_mlp": 1.00054908, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 5.682281711795024, + "language_loss": 0.74971759, + "learning_rate": 7.240324162598033e-08, + "loss": 0.77204633, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 2.547560453414917 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.0004555, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 2.049110632415226, + "language_loss": 0.75301075, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77538067, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 2.5399510860443115 + }, + { + "auxiliary_loss_clip": 0.01148326, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00043368, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.5008329099954856, + "language_loss": 0.7596119, + "learning_rate": 7.219570183756052e-08, + "loss": 0.7821219, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 2.5581326484680176 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.01104349, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00058877, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.099017206147891, + "language_loss": 0.72899699, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75154501, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.5831339359283447 + }, + { + "auxiliary_loss_clip": 0.01102226, + "auxiliary_loss_mlp": 0.01102949, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.00042915, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 2.0322581465768454, + "language_loss": 0.75846732, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78051901, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.641056776046753 + }, + { + "auxiliary_loss_clip": 0.01101749, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_clip": 1.00163591, + "balance_loss_mlp": 1.00037193, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.6142109731139453, + "language_loss": 0.75813842, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78018671, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.6577415466308594 + }, + { + "auxiliary_loss_clip": 0.01133793, + "auxiliary_loss_mlp": 0.01103224, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00041842, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 5.168330914474201, + "language_loss": 0.80113554, + "learning_rate": 7.178149952253298e-08, + "loss": 0.8235057, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.597719430923462 + }, + { + "auxiliary_loss_clip": 0.01164928, + "auxiliary_loss_mlp": 0.01103119, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00050378, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.5909148088827765, + "language_loss": 0.77447367, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79715413, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 2.51942777633667 + }, + { + "auxiliary_loss_clip": 0.01148439, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00037611, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.7280228333253, + "language_loss": 0.72955841, + "learning_rate": 7.157483705875256e-08, + "loss": 0.7520746, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 2.5625240802764893 + }, + { + "auxiliary_loss_clip": 0.01114789, + "auxiliary_loss_mlp": 0.01101981, + "balance_loss_clip": 1.0014782, + "balance_loss_mlp": 1.00041485, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.4183848698820276, + "language_loss": 0.78967774, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81184542, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 2.656635046005249 + }, + { + "auxiliary_loss_clip": 0.0114821, + "auxiliary_loss_mlp": 0.01102984, + "balance_loss_clip": 1.0017153, + "balance_loss_mlp": 1.00055933, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 1.7808717648761392, + "language_loss": 0.68566656, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70817846, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 4.15198016166687 + }, + { + "auxiliary_loss_clip": 0.01150008, + "auxiliary_loss_mlp": 0.01102279, + "balance_loss_clip": 1.00186527, + "balance_loss_mlp": 1.00052238, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.5811134420019608, + "language_loss": 0.83840102, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86092389, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 3.941985607147217 + }, + { + "auxiliary_loss_clip": 0.0113543, + "auxiliary_loss_mlp": 0.01102642, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.00050342, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.5358572731945466, + "language_loss": 0.77512455, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79750532, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 2.5966908931732178 + }, + { + "auxiliary_loss_clip": 0.01145673, + "auxiliary_loss_mlp": 0.01103238, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00052738, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 1.959284537880934, + "language_loss": 0.78598267, + "learning_rate": 7.105946067406999e-08, + "loss": 0.8084718, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.5101139545440674 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01102386, + "balance_loss_clip": 1.00156879, + "balance_loss_mlp": 1.00043869, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.5575097340294477, + "language_loss": 0.76305568, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78508306, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.6804089546203613 + }, + { + "auxiliary_loss_clip": 0.01103227, + "auxiliary_loss_mlp": 0.01101832, + "balance_loss_clip": 1.00159168, + "balance_loss_mlp": 1.00036061, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.4823898778832836, + "language_loss": 0.60714471, + "learning_rate": 7.085382211218637e-08, + "loss": 0.62919533, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 4.0928215980529785 + }, + { + "auxiliary_loss_clip": 0.01133559, + "auxiliary_loss_mlp": 0.0110306, + "balance_loss_clip": 1.0017271, + "balance_loss_mlp": 1.00054002, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.8139536342123819, + "language_loss": 0.73763388, + "learning_rate": 7.075111255942002e-08, + "loss": 0.76000005, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.541639566421509 + }, + { + "auxiliary_loss_clip": 0.01165079, + "auxiliary_loss_mlp": 0.01102949, + "balance_loss_clip": 1.00172234, + "balance_loss_mlp": 1.00042951, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 2.760667968770458, + "language_loss": 0.77642399, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79910427, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 2.4903769493103027 + }, + { + "auxiliary_loss_clip": 0.01165158, + "auxiliary_loss_mlp": 0.01103594, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.00050139, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.9257555927233174, + "language_loss": 0.75993383, + "learning_rate": 7.054591292971324e-08, + "loss": 0.78262132, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.493370294570923 + }, + { + "auxiliary_loss_clip": 0.01131759, + "auxiliary_loss_mlp": 0.0110314, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00052452, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.8629856725557539, + "language_loss": 0.8315118, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85386074, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.01165208, + "auxiliary_loss_mlp": 0.01103716, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00052857, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.5026526394518025, + "language_loss": 0.73301452, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75570381, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 2.5187900066375732 + }, + { + "auxiliary_loss_clip": 0.0116492, + "auxiliary_loss_mlp": 0.01103208, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00040197, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.5717242669224567, + "language_loss": 0.7740382, + "learning_rate": 7.023866223305486e-08, + "loss": 0.79671943, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.483459234237671 + }, + { + "auxiliary_loss_clip": 0.01143304, + "auxiliary_loss_mlp": 0.0074639, + "balance_loss_clip": 1.00122428, + "balance_loss_mlp": 1.00092685, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7364700675888691, + "language_loss": 0.5631848, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58208174, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 3.1362459659576416 + }, + { + "auxiliary_loss_clip": 0.01165023, + "auxiliary_loss_mlp": 0.0074724, + "balance_loss_clip": 1.00192559, + "balance_loss_mlp": 1.00037575, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 1.5986437192997358, + "language_loss": 0.75977832, + "learning_rate": 7.0034194312526e-08, + "loss": 0.77890098, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 2.488253355026245 + }, + { + "auxiliary_loss_clip": 0.01116323, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_clip": 1.00165987, + "balance_loss_mlp": 1.0004338, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 1.9264560937026558, + "language_loss": 0.72979218, + "learning_rate": 6.993207012706936e-08, + "loss": 0.75197828, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.771327495574951 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01102311, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00055361, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.5947058797686222, + "language_loss": 0.79760796, + "learning_rate": 6.98300191299821e-08, + "loss": 0.8202796, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 2.559511423110962 + }, + { + "auxiliary_loss_clip": 0.01119477, + "auxiliary_loss_mlp": 0.01102934, + "balance_loss_clip": 1.00164294, + "balance_loss_mlp": 1.00050938, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 1.7476493205597863, + "language_loss": 0.72790247, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75012654, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.660055160522461 + }, + { + "auxiliary_loss_clip": 0.01128861, + "auxiliary_loss_mlp": 0.01102176, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00070524, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 1.9232846714620022, + "language_loss": 0.73140806, + "learning_rate": 6.962613671639105e-08, + "loss": 0.75371844, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.641721725463867 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01101387, + "balance_loss_clip": 1.00164318, + "balance_loss_mlp": 1.00039268, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.497268051901324, + "language_loss": 0.74205703, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76421493, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.6520626544952393 + }, + { + "auxiliary_loss_clip": 0.0115027, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00062501, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.4020722816733975, + "language_loss": 0.68957651, + "learning_rate": 6.942254710267902e-08, + "loss": 0.71210587, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.5390660762786865 + }, + { + "auxiliary_loss_clip": 0.01148039, + "auxiliary_loss_mlp": 0.01102775, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00044584, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.8728622169259577, + "language_loss": 0.72540903, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74791723, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.514956474304199 + }, + { + "auxiliary_loss_clip": 0.01131344, + "auxiliary_loss_mlp": 0.01101953, + "balance_loss_clip": 1.00174344, + "balance_loss_mlp": 1.00048232, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 3.264326549227467, + "language_loss": 0.73322743, + "learning_rate": 6.921925031972642e-08, + "loss": 0.7555604, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 2.593902349472046 + }, + { + "auxiliary_loss_clip": 0.01113727, + "auxiliary_loss_mlp": 0.01079384, + "balance_loss_clip": 1.00093198, + "balance_loss_mlp": 1.00003862, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7160841789607083, + "language_loss": 0.5922364, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61416745, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.3081743717193604 + }, + { + "auxiliary_loss_clip": 0.01114127, + "auxiliary_loss_mlp": 0.01101857, + "balance_loss_clip": 1.00148284, + "balance_loss_mlp": 1.000386, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.5451019719803156, + "language_loss": 0.6420362, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66419613, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 2.5967118740081787 + }, + { + "auxiliary_loss_clip": 0.01160334, + "auxiliary_loss_mlp": 0.00746308, + "balance_loss_clip": 1.001127, + "balance_loss_mlp": 1.0007565, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8497157287662053, + "language_loss": 0.60221165, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62127811, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 3.0549352169036865 + }, + { + "auxiliary_loss_clip": 0.01133594, + "auxiliary_loss_mlp": 0.01102927, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00040686, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 1.6700845909184352, + "language_loss": 0.70106804, + "learning_rate": 6.881353536939815e-08, + "loss": 0.72343326, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 2.5637927055358887 + }, + { + "auxiliary_loss_clip": 0.01131217, + "auxiliary_loss_mlp": 0.01102744, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.00031948, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.779443918954944, + "language_loss": 0.84422731, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86656696, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 4.102548599243164 + }, + { + "auxiliary_loss_clip": 0.01133526, + "auxiliary_loss_mlp": 0.01103638, + "balance_loss_clip": 1.00188303, + "balance_loss_mlp": 1.00054526, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.8662277731414698, + "language_loss": 0.60221106, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62458265, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 2.544883966445923 + }, + { + "auxiliary_loss_clip": 0.01148544, + "auxiliary_loss_mlp": 0.00747396, + "balance_loss_clip": 1.0018934, + "balance_loss_mlp": 1.0003376, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.7428809779594225, + "language_loss": 0.65421456, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67317396, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 2.5715949535369873 + }, + { + "auxiliary_loss_clip": 0.01164871, + "auxiliary_loss_mlp": 0.01103026, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.0004102, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.8002864041177689, + "language_loss": 0.73528719, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75796616, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 2.502866506576538 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.01102464, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00042045, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 1.9712191066937725, + "language_loss": 0.7171092, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73978126, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 2.4915294647216797 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.01102955, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00053024, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 1.6777767009519178, + "language_loss": 0.73658729, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75926685, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.5304620265960693 + }, + { + "auxiliary_loss_clip": 0.01165112, + "auxiliary_loss_mlp": 0.01103289, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00038731, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 2.103486265498875, + "language_loss": 0.65133393, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67401791, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 2.4705450534820557 + }, + { + "auxiliary_loss_clip": 0.0116513, + "auxiliary_loss_mlp": 0.01102768, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00053418, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 1.8725551154137812, + "language_loss": 0.71077967, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73345864, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.5753700733184814 + }, + { + "auxiliary_loss_clip": 0.01119184, + "auxiliary_loss_mlp": 0.01103034, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00051379, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 2.0121323960399353, + "language_loss": 0.74349898, + "learning_rate": 6.790496110568921e-08, + "loss": 0.7657212, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.5671627521514893 + }, + { + "auxiliary_loss_clip": 0.01102753, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_clip": 1.00169134, + "balance_loss_mlp": 1.00041413, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 2.0200048313972934, + "language_loss": 0.71779346, + "learning_rate": 6.78043746849506e-08, + "loss": 0.73984182, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.6970512866973877 + }, + { + "auxiliary_loss_clip": 0.01133393, + "auxiliary_loss_mlp": 0.01102464, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00042057, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.590083146907229, + "language_loss": 0.70982003, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73217863, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.5846567153930664 + }, + { + "auxiliary_loss_clip": 0.01133565, + "auxiliary_loss_mlp": 0.01102703, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00046885, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 2.19910263649062, + "language_loss": 0.72928077, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75164342, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 4.035336494445801 + }, + { + "auxiliary_loss_clip": 0.01165005, + "auxiliary_loss_mlp": 0.01102952, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00043213, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.7954995273668355, + "language_loss": 0.78082776, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80350739, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 3.8780009746551514 + }, + { + "auxiliary_loss_clip": 0.01134806, + "auxiliary_loss_mlp": 0.01103691, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.00050402, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 2.139924945448571, + "language_loss": 0.7703979, + "learning_rate": 6.74027617306141e-08, + "loss": 0.7927829, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 2.619253158569336 + }, + { + "auxiliary_loss_clip": 0.01164886, + "auxiliary_loss_mlp": 0.0110209, + "balance_loss_clip": 1.00194466, + "balance_loss_mlp": 1.00052333, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.199059086521358, + "language_loss": 0.71262038, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73529017, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 2.5590460300445557 + }, + { + "auxiliary_loss_clip": 0.01164788, + "auxiliary_loss_mlp": 0.0110279, + "balance_loss_clip": 1.00184119, + "balance_loss_mlp": 1.00065184, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.0344162546969202, + "language_loss": 0.75217676, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77485257, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 3.884791851043701 + }, + { + "auxiliary_loss_clip": 0.01148476, + "auxiliary_loss_mlp": 0.00747461, + "balance_loss_clip": 1.00166368, + "balance_loss_mlp": 1.00038099, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.754616009160477, + "language_loss": 0.7326926, + "learning_rate": 6.710232148647676e-08, + "loss": 0.751652, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.566457748413086 + }, + { + "auxiliary_loss_clip": 0.01132017, + "auxiliary_loss_mlp": 0.01103246, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00053537, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.3548463290359294, + "language_loss": 0.79159617, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81394887, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.5358870029449463 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.01102223, + "balance_loss_clip": 1.00170875, + "balance_loss_mlp": 1.00037038, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 1.7919129693950695, + "language_loss": 0.64055502, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66276062, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 2.5907795429229736 + }, + { + "auxiliary_loss_clip": 0.01131226, + "auxiliary_loss_mlp": 0.00747188, + "balance_loss_clip": 1.00174415, + "balance_loss_mlp": 1.00036168, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.7099964675850483, + "language_loss": 0.69780493, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71658909, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 2.580613374710083 + }, + { + "auxiliary_loss_clip": 0.01150272, + "auxiliary_loss_mlp": 0.01104187, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00061762, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 1.9028753218654244, + "language_loss": 0.70922089, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73176551, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.507997512817383 + }, + { + "auxiliary_loss_clip": 0.01165105, + "auxiliary_loss_mlp": 0.01103582, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00049007, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.708578794132634, + "language_loss": 0.76527309, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78795999, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 2.540165901184082 + }, + { + "auxiliary_loss_clip": 0.01133356, + "auxiliary_loss_mlp": 0.01103422, + "balance_loss_clip": 1.00202417, + "balance_loss_mlp": 1.00052094, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.245521981321787, + "language_loss": 0.87554514, + "learning_rate": 6.650342008365006e-08, + "loss": 0.89791286, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 2.5323331356048584 + }, + { + "auxiliary_loss_clip": 0.01085948, + "auxiliary_loss_mlp": 0.01103618, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00043046, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 2.2283085169972408, + "language_loss": 0.76841134, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79030704, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 2.6848931312561035 + }, + { + "auxiliary_loss_clip": 0.01145739, + "auxiliary_loss_mlp": 0.01102804, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00057065, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 2.464449525883373, + "language_loss": 0.81829482, + "learning_rate": 6.630437278944501e-08, + "loss": 0.84078026, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 2.5863308906555176 + }, + { + "auxiliary_loss_clip": 0.01117018, + "auxiliary_loss_mlp": 0.01102645, + "balance_loss_clip": 1.00159132, + "balance_loss_mlp": 1.00050652, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 1.9491232862059802, + "language_loss": 0.72349501, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74569166, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.576223611831665 + }, + { + "auxiliary_loss_clip": 0.01148299, + "auxiliary_loss_mlp": 0.01103335, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.00043392, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 2.2393579834926403, + "language_loss": 0.78507864, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80759501, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 2.5629701614379883 + }, + { + "auxiliary_loss_clip": 0.01133418, + "auxiliary_loss_mlp": 0.01102698, + "balance_loss_clip": 1.00169277, + "balance_loss_mlp": 1.0004642, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 2.181834884184801, + "language_loss": 0.78137529, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80373645, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.5499613285064697 + }, + { + "auxiliary_loss_clip": 0.01099329, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.0004344, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 1.8019836758064187, + "language_loss": 0.66815466, + "learning_rate": 6.590715814235781e-08, + "loss": 0.69017363, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.6195428371429443 + }, + { + "auxiliary_loss_clip": 0.01085342, + "auxiliary_loss_mlp": 0.01103062, + "balance_loss_clip": 1.00169754, + "balance_loss_mlp": 1.00044632, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6324839541239597, + "language_loss": 0.66114569, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68302971, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.697575807571411 + }, + { + "auxiliary_loss_clip": 0.01148613, + "auxiliary_loss_mlp": 0.01102884, + "balance_loss_clip": 1.00174844, + "balance_loss_mlp": 1.00045943, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.6188119794574556, + "language_loss": 0.76290846, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78542352, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.575413942337036 + }, + { + "auxiliary_loss_clip": 0.01148156, + "auxiliary_loss_mlp": 0.01102456, + "balance_loss_clip": 1.00183737, + "balance_loss_mlp": 1.00050783, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.613087652246097, + "language_loss": 0.79146576, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81397188, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.5469350814819336 + }, + { + "auxiliary_loss_clip": 0.01149819, + "auxiliary_loss_mlp": 0.01102599, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00046039, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 2.3738264267097056, + "language_loss": 0.78448987, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80701405, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.5360605716705322 + }, + { + "auxiliary_loss_clip": 0.01132841, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00039077, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.7496208671497024, + "language_loss": 0.79019427, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81255656, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.5796115398406982 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00047839, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.91150235874243, + "language_loss": 0.75714463, + "learning_rate": 6.531353647657156e-08, + "loss": 0.77949774, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.01165063, + "auxiliary_loss_mlp": 0.01102614, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00057125, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.6696393853838807, + "language_loss": 0.69401813, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71669495, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 2.5519139766693115 + }, + { + "auxiliary_loss_clip": 0.01148334, + "auxiliary_loss_mlp": 0.01103958, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00058007, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 2.3577108765430657, + "language_loss": 0.83547056, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85799348, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 2.570417642593384 + }, + { + "auxiliary_loss_clip": 0.01131387, + "auxiliary_loss_mlp": 0.01101994, + "balance_loss_clip": 1.00175917, + "balance_loss_mlp": 1.00042725, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 1.9111554947203098, + "language_loss": 0.85512352, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87745732, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 2.533721446990967 + }, + { + "auxiliary_loss_clip": 0.01160291, + "auxiliary_loss_mlp": 0.01079392, + "balance_loss_clip": 1.00109851, + "balance_loss_mlp": 1.00004661, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7700419198686668, + "language_loss": 0.56221104, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58460784, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 4.604033946990967 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01104638, + "balance_loss_clip": 1.00171709, + "balance_loss_mlp": 1.00059235, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 1.9001339086625908, + "language_loss": 0.64258933, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66464001, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.6225924491882324 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00053215, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.4379236185451392, + "language_loss": 0.71590507, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73811585, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.6488804817199707 + }, + { + "auxiliary_loss_clip": 0.01098391, + "auxiliary_loss_mlp": 0.01103081, + "balance_loss_clip": 1.00166416, + "balance_loss_mlp": 1.00056052, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.7990937190487382, + "language_loss": 0.69652975, + "learning_rate": 6.462431596227725e-08, + "loss": 0.71854442, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 2.6270251274108887 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.01103764, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00048089, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 2.412415225560998, + "language_loss": 0.74417371, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76654905, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 2.5668699741363525 + }, + { + "auxiliary_loss_clip": 0.01148341, + "auxiliary_loss_mlp": 0.0110352, + "balance_loss_clip": 1.00185251, + "balance_loss_mlp": 1.0007143, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 2.150091153520115, + "language_loss": 0.71008527, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73260391, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 2.5365161895751953 + }, + { + "auxiliary_loss_clip": 0.0113375, + "auxiliary_loss_mlp": 0.01102124, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.00055778, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.74789775593966, + "language_loss": 0.78459901, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80695778, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.62298846244812 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01103451, + "balance_loss_clip": 1.00193298, + "balance_loss_mlp": 1.00045466, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.148840213966899, + "language_loss": 0.71129882, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73381686, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 2.5029516220092773 + }, + { + "auxiliary_loss_clip": 0.01148025, + "auxiliary_loss_mlp": 0.0110414, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00057149, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.636594229313971, + "language_loss": 0.77930158, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80182326, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.5647263526916504 + }, + { + "auxiliary_loss_clip": 0.01132498, + "auxiliary_loss_mlp": 0.01101935, + "balance_loss_clip": 1.00166488, + "balance_loss_mlp": 1.00046408, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.45278485553139, + "language_loss": 0.70992988, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73227417, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 2.615744113922119 + }, + { + "auxiliary_loss_clip": 0.01148106, + "auxiliary_loss_mlp": 0.01101735, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.0004549, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 2.337918597182051, + "language_loss": 0.86370111, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88619953, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.4955599308013916 + }, + { + "auxiliary_loss_clip": 0.01120681, + "auxiliary_loss_mlp": 0.01103153, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00044227, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 1.9291248572062547, + "language_loss": 0.75500453, + "learning_rate": 6.384103882660397e-08, + "loss": 0.7772429, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 4.161783933639526 + }, + { + "auxiliary_loss_clip": 0.01148156, + "auxiliary_loss_mlp": 0.0110217, + "balance_loss_clip": 1.00173903, + "balance_loss_mlp": 1.0004127, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.696676957966466, + "language_loss": 0.75046182, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77296507, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 2.5216095447540283 + }, + { + "auxiliary_loss_clip": 0.01097535, + "auxiliary_loss_mlp": 0.01101949, + "balance_loss_clip": 1.00167954, + "balance_loss_mlp": 1.00047803, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 2.0880522600009, + "language_loss": 0.74616128, + "learning_rate": 6.364595366195358e-08, + "loss": 0.76815611, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.6267828941345215 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_clip": 1.001212, + "balance_loss_mlp": 0.99998134, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8121994946289985, + "language_loss": 0.52828968, + "learning_rate": 6.354852121788879e-08, + "loss": 0.55051601, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 3.1217143535614014 + }, + { + "auxiliary_loss_clip": 0.01133717, + "auxiliary_loss_mlp": 0.01102215, + "balance_loss_clip": 1.00153577, + "balance_loss_mlp": 1.00045776, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 1.7237271204790054, + "language_loss": 0.62448037, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64683962, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 3.9550251960754395 + }, + { + "auxiliary_loss_clip": 0.01101369, + "auxiliary_loss_mlp": 0.0110299, + "balance_loss_clip": 1.00183618, + "balance_loss_mlp": 1.00037479, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.7200648898030049, + "language_loss": 0.7162528, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73829639, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 2.7031545639038086 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.0110183, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.00054979, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.9019956288523627, + "language_loss": 0.71713805, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73947519, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.535391330718994 + }, + { + "auxiliary_loss_clip": 0.01145837, + "auxiliary_loss_mlp": 0.01079399, + "balance_loss_clip": 1.00110817, + "balance_loss_mlp": 1.00005376, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8804411675660492, + "language_loss": 0.6531477, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67540008, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 3.0658457279205322 + }, + { + "auxiliary_loss_clip": 0.01149436, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.0004667, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 1.7059223718439336, + "language_loss": 0.66708016, + "learning_rate": 6.306246052787289e-08, + "loss": 0.68960249, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 2.6035072803497314 + }, + { + "auxiliary_loss_clip": 0.01165047, + "auxiliary_loss_mlp": 0.0110269, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.0003612, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.9280761221693192, + "language_loss": 0.71741629, + "learning_rate": 6.296546872173513e-08, + "loss": 0.74009365, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.5331408977508545 + }, + { + "auxiliary_loss_clip": 0.01116702, + "auxiliary_loss_mlp": 0.01103027, + "balance_loss_clip": 1.00180256, + "balance_loss_mlp": 1.00050712, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.425222099321563, + "language_loss": 0.70160598, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72380322, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 2.68870210647583 + }, + { + "auxiliary_loss_clip": 0.01098704, + "auxiliary_loss_mlp": 0.01102334, + "balance_loss_clip": 1.00165069, + "balance_loss_mlp": 1.00057685, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.9481578619908126, + "language_loss": 0.67735618, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69936663, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.718815326690674 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01103325, + "balance_loss_clip": 1.00163651, + "balance_loss_mlp": 1.00051928, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 2.000607857059036, + "language_loss": 0.69602239, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71805131, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 2.8819456100463867 + }, + { + "auxiliary_loss_clip": 0.01129417, + "auxiliary_loss_mlp": 0.01079042, + "balance_loss_clip": 1.00107193, + "balance_loss_mlp": 1.00007749, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7154878939565347, + "language_loss": 0.51989502, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54197961, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 3.3528635501861572 + }, + { + "auxiliary_loss_clip": 0.01164951, + "auxiliary_loss_mlp": 0.0110214, + "balance_loss_clip": 1.00199306, + "balance_loss_mlp": 1.00047815, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.6093319562569757, + "language_loss": 0.70331424, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72598517, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.4943954944610596 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01102654, + "balance_loss_clip": 1.00165737, + "balance_loss_mlp": 1.00061095, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.7028661988159237, + "language_loss": 0.77623987, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79858589, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.5781850814819336 + }, + { + "auxiliary_loss_clip": 0.01131804, + "auxiliary_loss_mlp": 0.01104085, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00051618, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 3.419500995744636, + "language_loss": 0.76179969, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78415859, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 2.5484161376953125 + }, + { + "auxiliary_loss_clip": 0.01148227, + "auxiliary_loss_mlp": 0.01101756, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00047612, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.519149471463696, + "language_loss": 0.76862109, + "learning_rate": 6.219217887256367e-08, + "loss": 0.79112095, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 2.5387604236602783 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01103187, + "balance_loss_clip": 1.00176609, + "balance_loss_mlp": 1.00047588, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 2.538520568224086, + "language_loss": 0.67830789, + "learning_rate": 6.209584827138959e-08, + "loss": 0.70067066, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.6006834506988525 + }, + { + "auxiliary_loss_clip": 0.01115629, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.0004077, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 4.040374236340838, + "language_loss": 0.86745238, + "learning_rate": 6.199959115573495e-08, + "loss": 0.88964844, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.5779078006744385 + }, + { + "auxiliary_loss_clip": 0.01128765, + "auxiliary_loss_mlp": 0.01079396, + "balance_loss_clip": 1.00104368, + "balance_loss_mlp": 1.00005054, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7664684456063482, + "language_loss": 0.60357046, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62565207, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.1056647300720215 + }, + { + "auxiliary_loss_clip": 0.01131597, + "auxiliary_loss_mlp": 0.01103172, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00036573, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.1864594786196965, + "language_loss": 0.77454138, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79688907, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.5545952320098877 + }, + { + "auxiliary_loss_clip": 0.01118175, + "auxiliary_loss_mlp": 0.01104124, + "balance_loss_clip": 1.0017153, + "balance_loss_mlp": 1.00045991, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 1.7811685752837754, + "language_loss": 0.59321952, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61544251, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.6416409015655518 + }, + { + "auxiliary_loss_clip": 0.01135059, + "auxiliary_loss_mlp": 0.0110299, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00046992, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.5052128005079648, + "language_loss": 0.74500513, + "learning_rate": 6.161529762127293e-08, + "loss": 0.7673856, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.615830659866333 + }, + { + "auxiliary_loss_clip": 0.01165126, + "auxiliary_loss_mlp": 0.01102941, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00051641, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 1.9096765245644054, + "language_loss": 0.64734167, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67002237, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.4953207969665527 + }, + { + "auxiliary_loss_clip": 0.01134397, + "auxiliary_loss_mlp": 0.01102674, + "balance_loss_clip": 1.00178182, + "balance_loss_mlp": 1.00043976, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.7255243823590525, + "language_loss": 0.74325323, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76562393, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.6333084106445312 + }, + { + "auxiliary_loss_clip": 0.01133723, + "auxiliary_loss_mlp": 0.01103502, + "balance_loss_clip": 1.00182974, + "balance_loss_mlp": 1.00050485, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 1.7413800004023585, + "language_loss": 0.60815823, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63053048, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 3.955252170562744 + }, + { + "auxiliary_loss_clip": 0.01117399, + "auxiliary_loss_mlp": 0.01103146, + "balance_loss_clip": 1.00171983, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.3344120301746916, + "language_loss": 0.69935036, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72155583, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 2.6522603034973145 + }, + { + "auxiliary_loss_clip": 0.01164921, + "auxiliary_loss_mlp": 0.0110279, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.0003655, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 1.9767272342118387, + "language_loss": 0.73282176, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75549889, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.531829357147217 + }, + { + "auxiliary_loss_clip": 0.01069423, + "auxiliary_loss_mlp": 0.01102715, + "balance_loss_clip": 1.00156724, + "balance_loss_mlp": 1.00048065, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 1.7417317262907845, + "language_loss": 0.64609587, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66781729, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.8131825923919678 + }, + { + "auxiliary_loss_clip": 0.01143844, + "auxiliary_loss_mlp": 0.00746315, + "balance_loss_clip": 1.00108135, + "balance_loss_mlp": 1.0007931, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7601728001117152, + "language_loss": 0.55191469, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57081628, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 3.2257540225982666 + }, + { + "auxiliary_loss_clip": 0.01118941, + "auxiliary_loss_mlp": 0.01103678, + "balance_loss_clip": 1.00175786, + "balance_loss_mlp": 1.0003953, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.8574751404157204, + "language_loss": 0.69464552, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71687174, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 2.615978240966797 + }, + { + "auxiliary_loss_clip": 0.01150545, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.00184572, + "balance_loss_mlp": 1.00041425, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.42595132424877, + "language_loss": 0.75770134, + "learning_rate": 6.075493749149463e-08, + "loss": 0.78024662, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.5893847942352295 + }, + { + "auxiliary_loss_clip": 0.01164917, + "auxiliary_loss_mlp": 0.01103082, + "balance_loss_clip": 1.00179648, + "balance_loss_mlp": 1.00046706, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 1.9214027196775771, + "language_loss": 0.83388919, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85656917, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 2.5390517711639404 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01102747, + "balance_loss_clip": 1.00170231, + "balance_loss_mlp": 1.00041759, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.4714921521196573, + "language_loss": 0.67802137, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70021081, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 2.6539697647094727 + }, + { + "auxiliary_loss_clip": 0.01164957, + "auxiliary_loss_mlp": 0.01103389, + "balance_loss_clip": 1.0019238, + "balance_loss_mlp": 1.0004878, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 5.331963113895232, + "language_loss": 0.62472963, + "learning_rate": 6.046947430586913e-08, + "loss": 0.64741307, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 2.5441136360168457 + }, + { + "auxiliary_loss_clip": 0.01116761, + "auxiliary_loss_mlp": 0.01102259, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.00050235, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.4218641534024175, + "language_loss": 0.74521804, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76740825, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.618886709213257 + }, + { + "auxiliary_loss_clip": 0.01133346, + "auxiliary_loss_mlp": 0.00747303, + "balance_loss_clip": 1.0017364, + "balance_loss_mlp": 1.00042558, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 1.9695067722831792, + "language_loss": 0.64706349, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66587007, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 3.9992315769195557 + }, + { + "auxiliary_loss_clip": 0.0114998, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00046873, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 2.072216978132995, + "language_loss": 0.74632448, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76885992, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.5751187801361084 + }, + { + "auxiliary_loss_clip": 0.01148353, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.00057328, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 1.9381985782895548, + "language_loss": 0.76494014, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78746319, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 3.957277774810791 + }, + { + "auxiliary_loss_clip": 0.01165043, + "auxiliary_loss_mlp": 0.0110299, + "balance_loss_clip": 1.00186992, + "balance_loss_mlp": 1.0004698, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.382558139055977, + "language_loss": 0.67569435, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69837463, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.5282163619995117 + }, + { + "auxiliary_loss_clip": 0.01131, + "auxiliary_loss_mlp": 0.0107935, + "balance_loss_clip": 1.00127292, + "balance_loss_mlp": 1.00000429, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7262307520202772, + "language_loss": 0.57722473, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59932828, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 3.106780529022217 + }, + { + "auxiliary_loss_clip": 0.01164744, + "auxiliary_loss_mlp": 0.01102167, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00050497, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 2.3957853054123883, + "language_loss": 0.69922358, + "learning_rate": 5.98059678590237e-08, + "loss": 0.72189265, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 4.010751008987427 + }, + { + "auxiliary_loss_clip": 0.01150315, + "auxiliary_loss_mlp": 0.01103204, + "balance_loss_clip": 1.00192058, + "balance_loss_mlp": 1.0006845, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.5614514462223084, + "language_loss": 0.75140059, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77393585, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 2.516338586807251 + }, + { + "auxiliary_loss_clip": 0.01118696, + "auxiliary_loss_mlp": 0.01102758, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00052369, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.6855185003678155, + "language_loss": 0.64976358, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67197812, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 2.6345133781433105 + }, + { + "auxiliary_loss_clip": 0.01131845, + "auxiliary_loss_mlp": 0.01102714, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00038481, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.677651399163766, + "language_loss": 0.66249627, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68484187, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 2.6325807571411133 + }, + { + "auxiliary_loss_clip": 0.0116039, + "auxiliary_loss_mlp": 0.01079342, + "balance_loss_clip": 1.00115502, + "balance_loss_mlp": 0.99999678, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6607824368826438, + "language_loss": 0.61151701, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63391435, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.089125156402588 + }, + { + "auxiliary_loss_clip": 0.01117104, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_clip": 1.00174963, + "balance_loss_mlp": 1.00053668, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.6893405199695644, + "language_loss": 0.74238384, + "learning_rate": 5.933424178131341e-08, + "loss": 0.76459014, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.5919835567474365 + }, + { + "auxiliary_loss_clip": 0.01165084, + "auxiliary_loss_mlp": 0.01103493, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00049615, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 2.118207200804196, + "language_loss": 0.62240881, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64509457, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 2.5965986251831055 + }, + { + "auxiliary_loss_clip": 0.01065181, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_clip": 1.00168669, + "balance_loss_mlp": 1.00039554, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 1.8460237679704836, + "language_loss": 0.84154594, + "learning_rate": 5.914606645688591e-08, + "loss": 0.8632164, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.77474308013916 + }, + { + "auxiliary_loss_clip": 0.01164938, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00037432, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.46166299502011, + "language_loss": 0.73431742, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75700527, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.7403981685638428 + }, + { + "auxiliary_loss_clip": 0.01148578, + "auxiliary_loss_mlp": 0.01103818, + "balance_loss_clip": 1.00190485, + "balance_loss_mlp": 1.00043964, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.7451371445293773, + "language_loss": 0.78634202, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80886596, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 2.542829751968384 + }, + { + "auxiliary_loss_clip": 0.01131444, + "auxiliary_loss_mlp": 0.01103354, + "balance_loss_clip": 1.00163221, + "balance_loss_mlp": 1.00054824, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.551823312938806, + "language_loss": 0.74861336, + "learning_rate": 5.886435545946455e-08, + "loss": 0.7709614, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 2.5703749656677246 + }, + { + "auxiliary_loss_clip": 0.01133308, + "auxiliary_loss_mlp": 0.01102395, + "balance_loss_clip": 1.00158799, + "balance_loss_mlp": 1.00035143, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.5248946938715793, + "language_loss": 0.75460601, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77696311, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 2.622448682785034 + }, + { + "auxiliary_loss_clip": 0.01132027, + "auxiliary_loss_mlp": 0.01102644, + "balance_loss_clip": 1.00171828, + "balance_loss_mlp": 1.00060093, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.9123436894733639, + "language_loss": 0.66351211, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68585885, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.5371317863464355 + }, + { + "auxiliary_loss_clip": 0.01164855, + "auxiliary_loss_mlp": 0.01103416, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00051463, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 1.8517721803748886, + "language_loss": 0.80425358, + "learning_rate": 5.85833069345496e-08, + "loss": 0.8269363, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.5059773921966553 + }, + { + "auxiliary_loss_clip": 0.0115008, + "auxiliary_loss_mlp": 0.01102722, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00058329, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.616983505882938, + "language_loss": 0.7477814, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77030939, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 2.529442548751831 + }, + { + "auxiliary_loss_clip": 0.01148097, + "auxiliary_loss_mlp": 0.01102576, + "balance_loss_clip": 1.00184929, + "balance_loss_mlp": 1.0007236, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.362922709539063, + "language_loss": 0.70295787, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72546458, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.6524271965026855 + }, + { + "auxiliary_loss_clip": 0.01147853, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_clip": 1.00177789, + "balance_loss_mlp": 1.00057352, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 4.583767917377361, + "language_loss": 0.81891358, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84142399, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 2.5450706481933594 + }, + { + "auxiliary_loss_clip": 0.01150706, + "auxiliary_loss_mlp": 0.01104298, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00044298, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 2.9649603047869597, + "language_loss": 0.79569387, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81824398, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.533750534057617 + }, + { + "auxiliary_loss_clip": 0.01118973, + "auxiliary_loss_mlp": 0.01103982, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.00050807, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 2.6264735709266893, + "language_loss": 0.75376749, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77599704, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.6207425594329834 + }, + { + "auxiliary_loss_clip": 0.01134786, + "auxiliary_loss_mlp": 0.01103273, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00037181, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.889506227352291, + "language_loss": 0.52149904, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54387963, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.6758735179901123 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01102733, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00040412, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.7097501912686117, + "language_loss": 0.77432978, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79700643, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.5666487216949463 + }, + { + "auxiliary_loss_clip": 0.01133762, + "auxiliary_loss_mlp": 0.01103659, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00047088, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.8544975343219643, + "language_loss": 0.69175345, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71412766, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.5453579425811768 + }, + { + "auxiliary_loss_clip": 0.01165063, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_clip": 1.00196123, + "balance_loss_mlp": 1.00035167, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.6377011856811907, + "language_loss": 0.7301268, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75280231, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 3.966759443283081 + }, + { + "auxiliary_loss_clip": 0.0110068, + "auxiliary_loss_mlp": 0.01101758, + "balance_loss_clip": 1.00150084, + "balance_loss_mlp": 1.00038278, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 3.249133821839803, + "language_loss": 0.71570587, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73773026, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.644829273223877 + }, + { + "auxiliary_loss_clip": 0.0116496, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.0004673, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.5701240263872087, + "language_loss": 0.8713209, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89400136, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.5320193767547607 + }, + { + "auxiliary_loss_clip": 0.01160325, + "auxiliary_loss_mlp": 0.01079334, + "balance_loss_clip": 1.00114655, + "balance_loss_mlp": 0.9999885, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.8045479087074251, + "language_loss": 0.55186945, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57426602, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 2.969209909439087 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01104562, + "balance_loss_clip": 1.00172162, + "balance_loss_mlp": 1.00042057, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.881077993275596, + "language_loss": 0.76314473, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78552604, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 2.6152260303497314 + }, + { + "auxiliary_loss_clip": 0.01133419, + "auxiliary_loss_mlp": 0.01101673, + "balance_loss_clip": 1.00166559, + "balance_loss_mlp": 1.00039291, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.467031137885236, + "language_loss": 0.7832855, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80563647, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.609999179840088 + }, + { + "auxiliary_loss_clip": 0.01146245, + "auxiliary_loss_mlp": 0.01079359, + "balance_loss_clip": 1.00128174, + "balance_loss_mlp": 1.00001323, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.76254607681085, + "language_loss": 0.51365018, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53590626, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 3.058671712875366 + }, + { + "auxiliary_loss_clip": 0.01148063, + "auxiliary_loss_mlp": 0.01101434, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00043964, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.63933569625399, + "language_loss": 0.82534039, + "learning_rate": 5.709557384259378e-08, + "loss": 0.8478353, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.5502169132232666 + }, + { + "auxiliary_loss_clip": 0.0116038, + "auxiliary_loss_mlp": 0.01079354, + "balance_loss_clip": 1.00117338, + "balance_loss_mlp": 1.00000846, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7361751323391946, + "language_loss": 0.51081777, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53321517, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 3.1453216075897217 + }, + { + "auxiliary_loss_clip": 0.01126875, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_clip": 1.0011425, + "balance_loss_mlp": 1.00003624, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6805358403404839, + "language_loss": 0.58733195, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60939449, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 3.1452271938323975 + }, + { + "auxiliary_loss_clip": 0.0113193, + "auxiliary_loss_mlp": 0.01103382, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00048029, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 5.981131134095519, + "language_loss": 0.71332586, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73567903, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.567035675048828 + }, + { + "auxiliary_loss_clip": 0.01100153, + "auxiliary_loss_mlp": 0.01103377, + "balance_loss_clip": 1.00151587, + "balance_loss_mlp": 1.00057077, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.7168361160299694, + "language_loss": 0.68809009, + "learning_rate": 5.672658701232458e-08, + "loss": 0.71012545, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 4.042182683944702 + }, + { + "auxiliary_loss_clip": 0.01102172, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_clip": 1.0017916, + "balance_loss_mlp": 1.00042665, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.0287566993344437, + "language_loss": 0.76445067, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78650856, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 4.046983957290649 + }, + { + "auxiliary_loss_clip": 0.0111709, + "auxiliary_loss_mlp": 0.01104175, + "balance_loss_clip": 1.00172329, + "balance_loss_mlp": 1.00051022, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 1.9527477709621035, + "language_loss": 0.72499859, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74721122, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.60819149017334 + }, + { + "auxiliary_loss_clip": 0.01130897, + "auxiliary_loss_mlp": 0.0110168, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00049484, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.6177261025684206, + "language_loss": 0.68400264, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70632845, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 2.8167240619659424 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01103706, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00042284, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 1.9115980033009843, + "language_loss": 0.754287, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77646768, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 4.071494102478027 + }, + { + "auxiliary_loss_clip": 0.01098926, + "auxiliary_loss_mlp": 0.01102928, + "balance_loss_clip": 1.0014559, + "balance_loss_mlp": 1.00040841, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 2.5369160854663715, + "language_loss": 0.8180896, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84010816, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 2.6861984729766846 + }, + { + "auxiliary_loss_clip": 0.01134024, + "auxiliary_loss_mlp": 0.01103335, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00043344, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 2.3876315330161018, + "language_loss": 0.75114155, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77351511, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 2.573305368423462 + }, + { + "auxiliary_loss_clip": 0.01164981, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00044084, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 1.603075293902003, + "language_loss": 0.66515255, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.68783289, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.6091110706329346 + }, + { + "auxiliary_loss_clip": 0.01084947, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_clip": 1.00160134, + "balance_loss_mlp": 1.00046813, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.7290673812435131, + "language_loss": 0.75990307, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.7817843, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 2.705972909927368 + }, + { + "auxiliary_loss_clip": 0.01148449, + "auxiliary_loss_mlp": 0.0110254, + "balance_loss_clip": 1.00167418, + "balance_loss_mlp": 1.00049651, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 2.0366970916519165, + "language_loss": 0.81689698, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83940685, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 2.5420196056365967 + }, + { + "auxiliary_loss_clip": 0.01135865, + "auxiliary_loss_mlp": 0.01103158, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00044739, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.2899306369898718, + "language_loss": 0.54175532, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56414557, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.6161794662475586 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01102866, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00053668, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.5827605742878363, + "language_loss": 0.71769238, + "learning_rate": 5.571795325221807e-08, + "loss": 0.73989069, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 2.5926270484924316 + }, + { + "auxiliary_loss_clip": 0.01148414, + "auxiliary_loss_mlp": 0.01102924, + "balance_loss_clip": 1.00169373, + "balance_loss_mlp": 1.00040448, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 1.894766499815626, + "language_loss": 0.75312555, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77563894, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.5273585319519043 + }, + { + "auxiliary_loss_clip": 0.01147998, + "auxiliary_loss_mlp": 0.01102206, + "balance_loss_clip": 1.00166035, + "balance_loss_mlp": 1.00035357, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.5444039528127584, + "language_loss": 0.76162058, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78412259, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.6564266681671143 + }, + { + "auxiliary_loss_clip": 0.0116464, + "auxiliary_loss_mlp": 0.01101565, + "balance_loss_clip": 1.00167096, + "balance_loss_mlp": 1.00047517, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 1.8338860477250065, + "language_loss": 0.75585431, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.77851635, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.5460894107818604 + }, + { + "auxiliary_loss_clip": 0.01148245, + "auxiliary_loss_mlp": 0.01102992, + "balance_loss_clip": 1.00173402, + "balance_loss_mlp": 1.00037706, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.548031467075152, + "language_loss": 0.76637435, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78888667, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.5644850730895996 + }, + { + "auxiliary_loss_clip": 0.01133934, + "auxiliary_loss_mlp": 0.01102979, + "balance_loss_clip": 1.00190914, + "balance_loss_mlp": 1.00045884, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 2.0016285482867526, + "language_loss": 0.72526777, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74763685, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 2.6069483757019043 + }, + { + "auxiliary_loss_clip": 0.01150272, + "auxiliary_loss_mlp": 0.01103191, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00048041, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.9283168077702242, + "language_loss": 0.77599841, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79853302, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 2.5839600563049316 + }, + { + "auxiliary_loss_clip": 0.01150306, + "auxiliary_loss_mlp": 0.01103677, + "balance_loss_clip": 1.00183475, + "balance_loss_mlp": 1.00039375, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 1.8516657283813454, + "language_loss": 0.75425482, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77679467, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.536440372467041 + }, + { + "auxiliary_loss_clip": 0.0114592, + "auxiliary_loss_mlp": 0.01079372, + "balance_loss_clip": 1.00117397, + "balance_loss_mlp": 1.00002635, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7797381480835973, + "language_loss": 0.60623765, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62849057, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 2.911545753479004 + }, + { + "auxiliary_loss_clip": 0.01131047, + "auxiliary_loss_mlp": 0.00747292, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00031948, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.6290122161647922, + "language_loss": 0.70460397, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72338736, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 2.641841173171997 + }, + { + "auxiliary_loss_clip": 0.01131424, + "auxiliary_loss_mlp": 0.0110332, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00041842, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.80431630408681, + "language_loss": 0.8304739, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85282135, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.585322856903076 + }, + { + "auxiliary_loss_clip": 0.01118355, + "auxiliary_loss_mlp": 0.01102819, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00048971, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.4561427647365277, + "language_loss": 0.76955128, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79176295, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.7049903869628906 + }, + { + "auxiliary_loss_clip": 0.01116844, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_clip": 1.00162601, + "balance_loss_mlp": 1.00035667, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 2.1525218486976176, + "language_loss": 0.74365854, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76584995, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 2.6448299884796143 + }, + { + "auxiliary_loss_clip": 0.01118577, + "auxiliary_loss_mlp": 0.01102261, + "balance_loss_clip": 1.00163615, + "balance_loss_mlp": 1.00050402, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 1.9504945104927516, + "language_loss": 0.75099176, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77320015, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.6028997898101807 + }, + { + "auxiliary_loss_clip": 0.01150142, + "auxiliary_loss_mlp": 0.0110408, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00051141, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.4329205821905684, + "language_loss": 0.76637608, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78891826, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.7113609313964844 + }, + { + "auxiliary_loss_clip": 0.01148293, + "auxiliary_loss_mlp": 0.01102595, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00055146, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.5581611780206046, + "language_loss": 0.7079823, + "learning_rate": 5.4356921308363e-08, + "loss": 0.73049116, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.580500602722168 + }, + { + "auxiliary_loss_clip": 0.01098406, + "auxiliary_loss_mlp": 0.01102901, + "balance_loss_clip": 1.00161123, + "balance_loss_mlp": 1.00047636, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.410546602198684, + "language_loss": 0.82868183, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.85069489, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 4.0405051708221436 + }, + { + "auxiliary_loss_clip": 0.01164816, + "auxiliary_loss_mlp": 0.01101407, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.0004127, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.774793561055916, + "language_loss": 0.66505784, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68772006, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.519167184829712 + }, + { + "auxiliary_loss_clip": 0.0113328, + "auxiliary_loss_mlp": 0.01102061, + "balance_loss_clip": 1.00178838, + "balance_loss_mlp": 1.00039959, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.6321724713110344, + "language_loss": 0.69114637, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71349978, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.5545389652252197 + }, + { + "auxiliary_loss_clip": 0.01165029, + "auxiliary_loss_mlp": 0.0110302, + "balance_loss_clip": 1.00193322, + "balance_loss_mlp": 1.00059497, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 1.8585354466662278, + "language_loss": 0.72065127, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74333173, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 2.4456331729888916 + }, + { + "auxiliary_loss_clip": 0.01133225, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_clip": 1.00170767, + "balance_loss_mlp": 1.00050247, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.0099489399067516, + "language_loss": 0.67112076, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69347078, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 2.6039953231811523 + }, + { + "auxiliary_loss_clip": 0.01148288, + "auxiliary_loss_mlp": 0.01103692, + "balance_loss_clip": 1.00190127, + "balance_loss_mlp": 1.00040901, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 3.584163813650611, + "language_loss": 0.71280116, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73532093, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 2.485926628112793 + }, + { + "auxiliary_loss_clip": 0.01165018, + "auxiliary_loss_mlp": 0.01103495, + "balance_loss_clip": 1.00188148, + "balance_loss_mlp": 1.00040317, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 2.796994667676195, + "language_loss": 0.64631665, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66900182, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.523157835006714 + }, + { + "auxiliary_loss_clip": 0.01133402, + "auxiliary_loss_mlp": 0.01103113, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00049758, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 1.6183222402853508, + "language_loss": 0.70380318, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72616833, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 2.6297435760498047 + }, + { + "auxiliary_loss_clip": 0.01119202, + "auxiliary_loss_mlp": 0.00747391, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00028145, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.7794931133830616, + "language_loss": 0.7698893, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78855526, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.6364550590515137 + }, + { + "auxiliary_loss_clip": 0.01150162, + "auxiliary_loss_mlp": 0.01101476, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00038671, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.564310718431525, + "language_loss": 0.64590746, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66842389, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 2.553220748901367 + }, + { + "auxiliary_loss_clip": 0.01114347, + "auxiliary_loss_mlp": 0.01103352, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00045085, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 2.043533656652125, + "language_loss": 0.80792153, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83009857, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 2.5909602642059326 + }, + { + "auxiliary_loss_clip": 0.01149899, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00047851, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.7635460962100062, + "language_loss": 0.65553463, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67450768, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 3.9466347694396973 + }, + { + "auxiliary_loss_clip": 0.0111712, + "auxiliary_loss_mlp": 0.01103421, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00051999, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 1.9418806779623956, + "language_loss": 0.73239589, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.7546013, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 3.9966588020324707 + }, + { + "auxiliary_loss_clip": 0.01150719, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00056064, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.4922586186090685, + "language_loss": 0.71141177, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73394305, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.5487656593322754 + }, + { + "auxiliary_loss_clip": 0.01086408, + "auxiliary_loss_mlp": 0.01104022, + "balance_loss_clip": 1.00152564, + "balance_loss_mlp": 1.00035715, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.666680263138198, + "language_loss": 0.69099224, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71289653, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.6953370571136475 + }, + { + "auxiliary_loss_clip": 0.01164697, + "auxiliary_loss_mlp": 0.01101436, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00044215, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.496107964548218, + "language_loss": 0.7215789, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74424022, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 3.9343554973602295 + }, + { + "auxiliary_loss_clip": 0.01164888, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00035954, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.4873291330874898, + "language_loss": 0.74113536, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76381594, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 2.4887852668762207 + }, + { + "auxiliary_loss_clip": 0.01164864, + "auxiliary_loss_mlp": 0.0074736, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00044036, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 1.995864166353144, + "language_loss": 0.67765939, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69678164, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 2.590325355529785 + }, + { + "auxiliary_loss_clip": 0.01116375, + "auxiliary_loss_mlp": 0.01102858, + "balance_loss_clip": 1.00164318, + "balance_loss_mlp": 1.0003382, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.2505749673571978, + "language_loss": 0.71927267, + "learning_rate": 5.265677957368875e-08, + "loss": 0.74146497, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 2.592648506164551 + }, + { + "auxiliary_loss_clip": 0.01135288, + "auxiliary_loss_mlp": 0.01103844, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.000561, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 1.888342835597543, + "language_loss": 0.7300306, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75242192, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 2.5903866291046143 + }, + { + "auxiliary_loss_clip": 0.01133543, + "auxiliary_loss_mlp": 0.01102651, + "balance_loss_clip": 1.00182045, + "balance_loss_mlp": 1.00032127, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.714176886014067, + "language_loss": 0.74290055, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76526248, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 2.5559442043304443 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01079739, + "balance_loss_clip": 1.00114346, + "balance_loss_mlp": 1.00001228, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8640668202785825, + "language_loss": 0.60713327, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62900937, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.094805955886841 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01103084, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00046885, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 2.1063342160892047, + "language_loss": 0.69089949, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71328437, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 2.5486011505126953 + }, + { + "auxiliary_loss_clip": 0.01114438, + "auxiliary_loss_mlp": 0.01103107, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00049162, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.758634648389413, + "language_loss": 0.64164042, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66381592, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.612363576889038 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01103334, + "balance_loss_clip": 1.00170517, + "balance_loss_mlp": 1.0005281, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 1.74055973852177, + "language_loss": 0.68665051, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70869982, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.6049582958221436 + }, + { + "auxiliary_loss_clip": 0.01131727, + "auxiliary_loss_mlp": 0.01103317, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00051069, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 1.9690545621588131, + "language_loss": 0.8104431, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83279353, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.543826103210449 + }, + { + "auxiliary_loss_clip": 0.01149373, + "auxiliary_loss_mlp": 0.01102889, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00055981, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.7085958822497238, + "language_loss": 0.71837085, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74089348, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.5505428314208984 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.0110288, + "balance_loss_clip": 1.00187171, + "balance_loss_mlp": 1.00055075, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.4329000731278523, + "language_loss": 0.59085923, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.6130715, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 2.5854275226593018 + }, + { + "auxiliary_loss_clip": 0.0110214, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.00160408, + "balance_loss_mlp": 1.00057793, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 2.9814869761208933, + "language_loss": 0.80609906, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82815909, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.677112102508545 + }, + { + "auxiliary_loss_clip": 0.0113251, + "auxiliary_loss_mlp": 0.01102535, + "balance_loss_clip": 1.00170732, + "balance_loss_mlp": 1.0003016, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 2.3351065318050277, + "language_loss": 0.78319514, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80554557, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 2.5771429538726807 + }, + { + "auxiliary_loss_clip": 0.01120568, + "auxiliary_loss_mlp": 0.01103288, + "balance_loss_clip": 1.00175786, + "balance_loss_mlp": 1.00038671, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.8409673658408134, + "language_loss": 0.62546957, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64770812, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 2.5871641635894775 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01102437, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00039411, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.318858905789393, + "language_loss": 0.70922858, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73160279, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.6973698139190674 + }, + { + "auxiliary_loss_clip": 0.01135831, + "auxiliary_loss_mlp": 0.01103082, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00037098, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 1.849419788969821, + "language_loss": 0.77167642, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79406554, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 2.5469210147857666 + }, + { + "auxiliary_loss_clip": 0.01078171, + "auxiliary_loss_mlp": 0.01079328, + "balance_loss_clip": 1.00071084, + "balance_loss_mlp": 0.99998254, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.7007141304828003, + "language_loss": 0.56495667, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58653164, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.428528308868408 + }, + { + "auxiliary_loss_clip": 0.01133409, + "auxiliary_loss_mlp": 0.01103818, + "balance_loss_clip": 1.00167656, + "balance_loss_mlp": 1.00063038, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.4104134826469732, + "language_loss": 0.7287429, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.7511152, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 2.793396234512329 + }, + { + "auxiliary_loss_clip": 0.01130771, + "auxiliary_loss_mlp": 0.01103497, + "balance_loss_clip": 1.00191176, + "balance_loss_mlp": 1.0005002, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 2.2308008050677213, + "language_loss": 0.71821547, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.74055815, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 2.600956678390503 + }, + { + "auxiliary_loss_clip": 0.01150006, + "auxiliary_loss_mlp": 0.01103474, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00038147, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 2.986746494392538, + "language_loss": 0.75661188, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77914667, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 2.536717414855957 + }, + { + "auxiliary_loss_clip": 0.01131886, + "auxiliary_loss_mlp": 0.01103031, + "balance_loss_clip": 1.00176191, + "balance_loss_mlp": 1.00051141, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 2.3047283034704913, + "language_loss": 0.75915277, + "learning_rate": 5.098329529416379e-08, + "loss": 0.78150201, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 2.6464314460754395 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01102241, + "balance_loss_clip": 1.00177193, + "balance_loss_mlp": 1.00048375, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.4710279254222607, + "language_loss": 0.74781108, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76995409, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 4.043294668197632 + }, + { + "auxiliary_loss_clip": 0.01148335, + "auxiliary_loss_mlp": 0.01102374, + "balance_loss_clip": 1.00160217, + "balance_loss_mlp": 1.00042665, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.4374980695894615, + "language_loss": 0.69874573, + "learning_rate": 5.080869070341487e-08, + "loss": 0.7212528, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 2.4990029335021973 + }, + { + "auxiliary_loss_clip": 0.01133438, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00048101, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.6812256612154393, + "language_loss": 0.88528442, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90763927, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.577747344970703 + }, + { + "auxiliary_loss_clip": 0.01131947, + "auxiliary_loss_mlp": 0.01103735, + "balance_loss_clip": 1.00161862, + "balance_loss_mlp": 1.00045228, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 2.03421403080083, + "language_loss": 0.64204133, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66439819, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 2.5704421997070312 + }, + { + "auxiliary_loss_clip": 0.01165052, + "auxiliary_loss_mlp": 0.0110248, + "balance_loss_clip": 1.00192773, + "balance_loss_mlp": 1.00062728, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 2.0672182307741647, + "language_loss": 0.74636364, + "learning_rate": 5.054733817702339e-08, + "loss": 0.76903892, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.522688865661621 + }, + { + "auxiliary_loss_clip": 0.01150266, + "auxiliary_loss_mlp": 0.01102559, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00042057, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.0974839007569654, + "language_loss": 0.66166127, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68418956, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.5903358459472656 + }, + { + "auxiliary_loss_clip": 0.01099546, + "auxiliary_loss_mlp": 0.01102483, + "balance_loss_clip": 1.00163925, + "balance_loss_mlp": 1.00044036, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.069189059045405, + "language_loss": 0.69005251, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71207285, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 2.626387357711792 + }, + { + "auxiliary_loss_clip": 0.0113278, + "auxiliary_loss_mlp": 0.01102468, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00042486, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.590679487826741, + "language_loss": 0.58579814, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.6081506, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 2.6049537658691406 + }, + { + "auxiliary_loss_clip": 0.01132003, + "auxiliary_loss_mlp": 0.01104098, + "balance_loss_clip": 1.00177872, + "balance_loss_mlp": 1.00033808, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 1.8474774898369095, + "language_loss": 0.78650033, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80886132, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 2.5345518589019775 + }, + { + "auxiliary_loss_clip": 0.01164917, + "auxiliary_loss_mlp": 0.01102331, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00038302, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 6.049324137654676, + "language_loss": 0.68901217, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.71168458, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.605868101119995 + }, + { + "auxiliary_loss_clip": 0.01165012, + "auxiliary_loss_mlp": 0.01102742, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00050855, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 2.6933154782909985, + "language_loss": 0.6793893, + "learning_rate": 5.002662914604583e-08, + "loss": 0.70206678, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 3.9736812114715576 + }, + { + "auxiliary_loss_clip": 0.01134669, + "auxiliary_loss_mlp": 0.01102308, + "balance_loss_clip": 1.00166249, + "balance_loss_mlp": 1.00026464, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.7525640688183612, + "language_loss": 0.74577165, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76814145, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.580915927886963 + }, + { + "auxiliary_loss_clip": 0.01149641, + "auxiliary_loss_mlp": 0.01102252, + "balance_loss_clip": 1.00175679, + "balance_loss_mlp": 1.00049543, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.7765481846804343, + "language_loss": 0.80061352, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82313246, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 3.9367144107818604 + }, + { + "auxiliary_loss_clip": 0.01131666, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_clip": 1.00154746, + "balance_loss_mlp": 1.00041485, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 1.9161495247574771, + "language_loss": 0.74258626, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76493132, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.535614490509033 + }, + { + "auxiliary_loss_clip": 0.01131088, + "auxiliary_loss_mlp": 0.01103434, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00053287, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.2919722484219927, + "language_loss": 0.76177478, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78412008, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 2.558448076248169 + }, + { + "auxiliary_loss_clip": 0.01103542, + "auxiliary_loss_mlp": 0.01102957, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00043678, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 2.103853831231098, + "language_loss": 0.78355974, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80562478, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 4.131332874298096 + }, + { + "auxiliary_loss_clip": 0.01114871, + "auxiliary_loss_mlp": 0.01103795, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00041676, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 1.979240242840911, + "language_loss": 0.77271616, + "learning_rate": 4.950858206945674e-08, + "loss": 0.7949028, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.6339776515960693 + }, + { + "auxiliary_loss_clip": 0.01118323, + "auxiliary_loss_mlp": 0.01102335, + "balance_loss_clip": 1.00161636, + "balance_loss_mlp": 1.00038695, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.438929305444127, + "language_loss": 0.6729117, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69511831, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 2.7530505657196045 + }, + { + "auxiliary_loss_clip": 0.01131316, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_clip": 1.00161719, + "balance_loss_mlp": 1.00038171, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 2.107160050286646, + "language_loss": 0.75253487, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77486843, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 2.620096206665039 + }, + { + "auxiliary_loss_clip": 0.01165057, + "auxiliary_loss_mlp": 0.01104018, + "balance_loss_clip": 1.00190675, + "balance_loss_mlp": 1.00054407, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 2.099823348715483, + "language_loss": 0.80530274, + "learning_rate": 4.925055698519931e-08, + "loss": 0.82799345, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 2.4840993881225586 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01103144, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00043297, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 2.087800579646566, + "language_loss": 0.72108817, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74309999, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 2.65614652633667 + }, + { + "auxiliary_loss_clip": 0.01132455, + "auxiliary_loss_mlp": 0.00747214, + "balance_loss_clip": 1.00171733, + "balance_loss_mlp": 1.00031114, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.8615024776425746, + "language_loss": 0.74377978, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76257646, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.610199451446533 + }, + { + "auxiliary_loss_clip": 0.01145962, + "auxiliary_loss_mlp": 0.01079366, + "balance_loss_clip": 1.00111961, + "balance_loss_mlp": 1.00002003, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.7134766935197032, + "language_loss": 0.53464985, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55690312, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 2.994382619857788 + }, + { + "auxiliary_loss_clip": 0.01149868, + "auxiliary_loss_mlp": 0.01102577, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00043857, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.920641932703452, + "language_loss": 0.70651829, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72904277, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.5275797843933105 + }, + { + "auxiliary_loss_clip": 0.01145574, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_clip": 1.00188696, + "balance_loss_mlp": 1.00036275, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.5181076813030827, + "language_loss": 0.67876065, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70125473, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 2.6050806045532227 + }, + { + "auxiliary_loss_clip": 0.01164823, + "auxiliary_loss_mlp": 0.01102363, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00041497, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.9095371754455663, + "language_loss": 0.61510575, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63777757, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.574324369430542 + }, + { + "auxiliary_loss_clip": 0.01148692, + "auxiliary_loss_mlp": 0.01102837, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00050831, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.5639711795746545, + "language_loss": 0.77060658, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79312181, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.6483447551727295 + }, + { + "auxiliary_loss_clip": 0.0114879, + "auxiliary_loss_mlp": 0.00747298, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00037217, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.7173784725744998, + "language_loss": 0.66607058, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68503141, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 2.585726022720337 + }, + { + "auxiliary_loss_clip": 0.01131915, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_clip": 1.00178576, + "balance_loss_mlp": 1.00059152, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.7390798202775712, + "language_loss": 0.79870343, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82105273, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.5814085006713867 + }, + { + "auxiliary_loss_clip": 0.01101301, + "auxiliary_loss_mlp": 0.01101726, + "balance_loss_clip": 1.00157166, + "balance_loss_mlp": 1.00054073, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.6329287641484866, + "language_loss": 0.76847959, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79050982, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.6744918823242188 + }, + { + "auxiliary_loss_clip": 0.01115713, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.0015223, + "balance_loss_mlp": 1.00036716, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.549656849297848, + "language_loss": 0.72230828, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74447525, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 2.628516912460327 + }, + { + "auxiliary_loss_clip": 0.01165121, + "auxiliary_loss_mlp": 0.01103595, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00040746, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.5975383170007607, + "language_loss": 0.66148901, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68417621, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.5016396045684814 + }, + { + "auxiliary_loss_clip": 0.01148292, + "auxiliary_loss_mlp": 0.00747431, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00039208, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.4060961627734558, + "language_loss": 0.65928298, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67824018, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 2.5992941856384277 + }, + { + "auxiliary_loss_clip": 0.01118898, + "auxiliary_loss_mlp": 0.01103451, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00045395, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.5106173271191203, + "language_loss": 0.75006557, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77228904, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 2.6130335330963135 + }, + { + "auxiliary_loss_clip": 0.01148506, + "auxiliary_loss_mlp": 0.00747385, + "balance_loss_clip": 1.00192952, + "balance_loss_mlp": 1.00042534, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.7642640882552545, + "language_loss": 0.71168202, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73064095, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.5738883018493652 + }, + { + "auxiliary_loss_clip": 0.01133113, + "auxiliary_loss_mlp": 0.0110301, + "balance_loss_clip": 1.00173187, + "balance_loss_mlp": 1.00039458, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 1.7112698280944085, + "language_loss": 0.75091791, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77327913, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.5484097003936768 + }, + { + "auxiliary_loss_clip": 0.01117975, + "auxiliary_loss_mlp": 0.011018, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00042415, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 2.367247022269174, + "language_loss": 0.83587468, + "learning_rate": 4.780099275981597e-08, + "loss": 0.8580724, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 2.632368326187134 + }, + { + "auxiliary_loss_clip": 0.01164876, + "auxiliary_loss_mlp": 0.01102007, + "balance_loss_clip": 1.00180542, + "balance_loss_mlp": 1.00044024, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.4483428565668055, + "language_loss": 0.67642677, + "learning_rate": 4.771639036957742e-08, + "loss": 0.69909561, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 2.5099830627441406 + }, + { + "auxiliary_loss_clip": 0.0111881, + "auxiliary_loss_mlp": 0.0110203, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00046396, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.6478462768252067, + "language_loss": 0.7211594, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74336779, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 4.120968341827393 + }, + { + "auxiliary_loss_clip": 0.01148211, + "auxiliary_loss_mlp": 0.01102612, + "balance_loss_clip": 1.00179851, + "balance_loss_mlp": 1.00033057, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 1.9199333511683512, + "language_loss": 0.74637419, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76888239, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.5687904357910156 + }, + { + "auxiliary_loss_clip": 0.01150272, + "auxiliary_loss_mlp": 0.01102764, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00043452, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.655496330624585, + "language_loss": 0.70336384, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72589421, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.599222421646118 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_clip": 1.00183761, + "balance_loss_mlp": 1.00057387, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.8541679667801925, + "language_loss": 0.78254336, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80491209, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 2.565737009048462 + }, + { + "auxiliary_loss_clip": 0.01164955, + "auxiliary_loss_mlp": 0.01103449, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00045228, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.5044243986389667, + "language_loss": 0.80325741, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82594144, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.524658203125 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.01104222, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00055814, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 1.9499705028418473, + "language_loss": 0.79754329, + "learning_rate": 4.721033078682768e-08, + "loss": 0.8198781, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.546090602874756 + }, + { + "auxiliary_loss_clip": 0.01128836, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00048614, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.6899197353653697, + "language_loss": 0.71587718, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.73818886, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.757296562194824 + }, + { + "auxiliary_loss_clip": 0.01133658, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_clip": 1.00174046, + "balance_loss_mlp": 1.00038421, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.5477462067569023, + "language_loss": 0.80275512, + "learning_rate": 4.704223662500806e-08, + "loss": 0.8251245, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 2.5408544540405273 + }, + { + "auxiliary_loss_clip": 0.01120606, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.6079136385554023, + "language_loss": 0.80819726, + "learning_rate": 4.695830062703643e-08, + "loss": 0.83043671, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 2.587538480758667 + }, + { + "auxiliary_loss_clip": 0.01131519, + "auxiliary_loss_mlp": 0.01102541, + "balance_loss_clip": 1.00168252, + "balance_loss_mlp": 1.00049806, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 2.6269128753729856, + "language_loss": 0.74672544, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76906598, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 2.5561790466308594 + }, + { + "auxiliary_loss_clip": 0.01133418, + "auxiliary_loss_mlp": 0.0110257, + "balance_loss_clip": 1.00173402, + "balance_loss_mlp": 1.00052643, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 1.9742872154806408, + "language_loss": 0.75806379, + "learning_rate": 4.679065081288458e-08, + "loss": 0.78042364, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 4.002460956573486 + }, + { + "auxiliary_loss_clip": 0.0108551, + "auxiliary_loss_mlp": 0.011019, + "balance_loss_clip": 1.0014801, + "balance_loss_mlp": 1.00042892, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 1.9822367115230746, + "language_loss": 0.83212507, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.8539992, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 4.089586496353149 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01102273, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.0005157, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.774501149919806, + "language_loss": 0.7619912, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.7845149, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.5566258430480957 + }, + { + "auxiliary_loss_clip": 0.01150373, + "auxiliary_loss_mlp": 0.01102255, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00040221, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.7297148422250983, + "language_loss": 0.77597415, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79850042, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 2.5946991443634033 + }, + { + "auxiliary_loss_clip": 0.0111749, + "auxiliary_loss_mlp": 0.00747372, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.00041533, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.6350650997250393, + "language_loss": 0.63440615, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.65305477, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.6267013549804688 + }, + { + "auxiliary_loss_clip": 0.01133429, + "auxiliary_loss_mlp": 0.01102576, + "balance_loss_clip": 1.0017668, + "balance_loss_mlp": 1.00043786, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.4787643881534067, + "language_loss": 0.67785001, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70021003, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 4.025188446044922 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01103597, + "balance_loss_clip": 1.00172055, + "balance_loss_mlp": 1.00059974, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.4980585378924138, + "language_loss": 0.73860157, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76065981, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.7063465118408203 + }, + { + "auxiliary_loss_clip": 0.01101597, + "auxiliary_loss_mlp": 0.01102262, + "balance_loss_clip": 1.00152671, + "balance_loss_mlp": 1.00060058, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.8337592936373404, + "language_loss": 0.8396368, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.86167538, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.6767029762268066 + }, + { + "auxiliary_loss_clip": 0.01099618, + "auxiliary_loss_mlp": 0.01102136, + "balance_loss_clip": 1.00166368, + "balance_loss_mlp": 1.00037837, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.8176074207092765, + "language_loss": 0.69109482, + "learning_rate": 4.61230144456366e-08, + "loss": 0.71311235, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 2.659193992614746 + }, + { + "auxiliary_loss_clip": 0.01165207, + "auxiliary_loss_mlp": 0.01103458, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.00046086, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 3.477526699353173, + "language_loss": 0.65225035, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67493701, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 2.4683563709259033 + }, + { + "auxiliary_loss_clip": 0.01165001, + "auxiliary_loss_mlp": 0.0110331, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00040829, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 1.7452982422149652, + "language_loss": 0.74652481, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.76920795, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 2.47483491897583 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01102601, + "balance_loss_clip": 1.00154209, + "balance_loss_mlp": 1.00046206, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.7167952969094786, + "language_loss": 0.62813771, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65018284, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 2.6267812252044678 + }, + { + "auxiliary_loss_clip": 0.01131658, + "auxiliary_loss_mlp": 0.01101225, + "balance_loss_clip": 1.00163507, + "balance_loss_mlp": 1.00051665, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.8944314752450928, + "language_loss": 0.72283041, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74515927, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.573326349258423 + }, + { + "auxiliary_loss_clip": 0.01135236, + "auxiliary_loss_mlp": 0.01103688, + "balance_loss_clip": 1.001791, + "balance_loss_mlp": 1.00040531, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 2.0595977320021523, + "language_loss": 0.70823377, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.73062301, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.625195026397705 + }, + { + "auxiliary_loss_clip": 0.01165043, + "auxiliary_loss_mlp": 0.00747358, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00034404, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.5668195662276576, + "language_loss": 0.73239297, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75151694, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.525465965270996 + }, + { + "auxiliary_loss_clip": 0.01116191, + "auxiliary_loss_mlp": 0.01102504, + "balance_loss_clip": 1.00165665, + "balance_loss_mlp": 1.00046134, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 2.740668768745931, + "language_loss": 0.79557109, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81775802, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.5850484371185303 + }, + { + "auxiliary_loss_clip": 0.01164704, + "auxiliary_loss_mlp": 0.01101994, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00052285, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 2.131879192586003, + "language_loss": 0.74051023, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76317716, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 2.545156240463257 + }, + { + "auxiliary_loss_clip": 0.01148756, + "auxiliary_loss_mlp": 0.01104106, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00053668, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.9736095181548898, + "language_loss": 0.77274942, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79527807, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.5977864265441895 + }, + { + "auxiliary_loss_clip": 0.01118353, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_clip": 1.0016489, + "balance_loss_mlp": 1.00039124, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.4742330383243292, + "language_loss": 0.80539238, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82759547, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 2.6557507514953613 + }, + { + "auxiliary_loss_clip": 0.01132746, + "auxiliary_loss_mlp": 0.01103828, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00044966, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.664843829558193, + "language_loss": 0.7812016, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.80356735, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.6279327869415283 + }, + { + "auxiliary_loss_clip": 0.01131658, + "auxiliary_loss_mlp": 0.01102087, + "balance_loss_clip": 1.00164104, + "balance_loss_mlp": 1.00042558, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.463058460913246, + "language_loss": 0.73409462, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75643206, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.5978851318359375 + }, + { + "auxiliary_loss_clip": 0.01098619, + "auxiliary_loss_mlp": 0.01102557, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00041819, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.4373336220568333, + "language_loss": 0.64692062, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66893238, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.7913591861724854 + }, + { + "auxiliary_loss_clip": 0.01148527, + "auxiliary_loss_mlp": 0.01103158, + "balance_loss_clip": 1.00175166, + "balance_loss_mlp": 1.00044692, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.717685287391027, + "language_loss": 0.76388729, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78640413, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 2.5508344173431396 + }, + { + "auxiliary_loss_clip": 0.01148602, + "auxiliary_loss_mlp": 0.01103372, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.0004704, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.9511270286660143, + "language_loss": 0.66810095, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.69062078, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.587460517883301 + }, + { + "auxiliary_loss_clip": 0.01117322, + "auxiliary_loss_mlp": 0.01103315, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00041389, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 2.0768164287225916, + "language_loss": 0.69581103, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71801746, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 2.6624503135681152 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01103677, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00039399, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 2.139456089141592, + "language_loss": 0.69478691, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71732581, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 2.6017768383026123 + }, + { + "auxiliary_loss_clip": 0.01150121, + "auxiliary_loss_mlp": 0.0110397, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00040078, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 2.0850415885119893, + "language_loss": 0.77271199, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79525292, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.5565004348754883 + }, + { + "auxiliary_loss_clip": 0.01148319, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.00033128, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.417039758147514, + "language_loss": 0.68747163, + "learning_rate": 4.455638541847495e-08, + "loss": 0.70999002, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 3.9801738262176514 + }, + { + "auxiliary_loss_clip": 0.0111901, + "auxiliary_loss_mlp": 0.01101611, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00042653, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 1.7887616042751988, + "language_loss": 0.82189918, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84410536, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.7114882469177246 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01102483, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.00053537, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.7641673880937336, + "language_loss": 0.83662999, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85915339, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 2.5083727836608887 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01104222, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00046206, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.5444795939426634, + "language_loss": 0.65177089, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67429709, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.661877155303955 + }, + { + "auxiliary_loss_clip": 0.01148378, + "auxiliary_loss_mlp": 0.0110433, + "balance_loss_clip": 1.00191677, + "balance_loss_mlp": 1.00056982, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 1.842590241537014, + "language_loss": 0.80060577, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82313287, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 2.5438969135284424 + }, + { + "auxiliary_loss_clip": 0.01148237, + "auxiliary_loss_mlp": 0.0110246, + "balance_loss_clip": 1.00197077, + "balance_loss_mlp": 1.00051236, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.6549998901447398, + "language_loss": 0.75447965, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.7769866, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.5512237548828125 + }, + { + "auxiliary_loss_clip": 0.01086507, + "auxiliary_loss_mlp": 0.01101716, + "balance_loss_clip": 1.00162125, + "balance_loss_mlp": 1.0005312, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.730578837810969, + "language_loss": 0.73769605, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75957835, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.7326838970184326 + }, + { + "auxiliary_loss_clip": 0.01101761, + "auxiliary_loss_mlp": 0.01103357, + "balance_loss_clip": 1.00169122, + "balance_loss_mlp": 1.00064635, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.9544476621134765, + "language_loss": 0.77279484, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79484606, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.6568827629089355 + }, + { + "auxiliary_loss_clip": 0.01119189, + "auxiliary_loss_mlp": 0.0110368, + "balance_loss_clip": 1.00152421, + "balance_loss_mlp": 1.00058806, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.8910034998060719, + "language_loss": 0.78494, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80716872, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.5984833240509033 + }, + { + "auxiliary_loss_clip": 0.01133047, + "auxiliary_loss_mlp": 0.01101315, + "balance_loss_clip": 1.00169146, + "balance_loss_mlp": 1.00041676, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.515449756809629, + "language_loss": 0.69342673, + "learning_rate": 4.382363965244695e-08, + "loss": 0.7157703, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 2.5526227951049805 + }, + { + "auxiliary_loss_clip": 0.01049726, + "auxiliary_loss_mlp": 0.01103582, + "balance_loss_clip": 1.0016886, + "balance_loss_mlp": 1.00058508, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.7725419489832057, + "language_loss": 0.75688511, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77841818, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 4.359264612197876 + }, + { + "auxiliary_loss_clip": 0.01132984, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_clip": 1.00169122, + "balance_loss_mlp": 1.00045085, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.416809454106827, + "language_loss": 0.71938747, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74174511, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 2.9554948806762695 + }, + { + "auxiliary_loss_clip": 0.01164905, + "auxiliary_loss_mlp": 0.0110303, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00041437, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.5312648761634287, + "language_loss": 0.63168871, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65436804, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 4.559262752532959 + }, + { + "auxiliary_loss_clip": 0.0113184, + "auxiliary_loss_mlp": 0.01102916, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00039613, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 1.9170308344782145, + "language_loss": 0.73319715, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75554472, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.632276773452759 + }, + { + "auxiliary_loss_clip": 0.01099023, + "auxiliary_loss_mlp": 0.00747358, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00036979, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.7026386396346755, + "language_loss": 0.63868594, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65714979, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 2.8183095455169678 + }, + { + "auxiliary_loss_clip": 0.0107125, + "auxiliary_loss_mlp": 0.01104981, + "balance_loss_clip": 1.00164735, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 1.952763438957794, + "language_loss": 0.6366452, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.65840757, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": 2.735711097717285 + }, + { + "auxiliary_loss_clip": 0.01164877, + "auxiliary_loss_mlp": 0.0110326, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00054908, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.7485191982409658, + "language_loss": 0.75335705, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77603835, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 4.155558347702026 + }, + { + "auxiliary_loss_clip": 0.0114313, + "auxiliary_loss_mlp": 0.01079025, + "balance_loss_clip": 1.0010004, + "balance_loss_mlp": 1.00006092, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9616645097346127, + "language_loss": 0.62379062, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64601219, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 3.019216299057007 + }, + { + "auxiliary_loss_clip": 0.01097802, + "auxiliary_loss_mlp": 0.01102371, + "balance_loss_clip": 1.00178683, + "balance_loss_mlp": 1.00056577, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.4505674875370895, + "language_loss": 0.78209507, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80409682, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.6864776611328125 + }, + { + "auxiliary_loss_clip": 0.01165026, + "auxiliary_loss_mlp": 0.011036, + "balance_loss_clip": 1.00177026, + "balance_loss_mlp": 1.00050747, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 1.91453124207148, + "language_loss": 0.78336477, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80605102, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 2.5147173404693604 + }, + { + "auxiliary_loss_clip": 0.01148245, + "auxiliary_loss_mlp": 0.01102028, + "balance_loss_clip": 1.00182498, + "balance_loss_mlp": 1.00036633, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 2.6901032225065773, + "language_loss": 0.72075117, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74325383, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 2.564772367477417 + }, + { + "auxiliary_loss_clip": 0.01117039, + "auxiliary_loss_mlp": 0.00747344, + "balance_loss_clip": 1.00166988, + "balance_loss_mlp": 1.00045252, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.7758673610425486, + "language_loss": 0.67591119, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69455498, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 2.6281018257141113 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.011034, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.00049818, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 1.94974405918794, + "language_loss": 0.62178022, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64415514, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 2.675987958908081 + }, + { + "auxiliary_loss_clip": 0.01135362, + "auxiliary_loss_mlp": 0.01103065, + "balance_loss_clip": 1.00176537, + "balance_loss_mlp": 1.00054502, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.673901478219204, + "language_loss": 0.78646833, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80885261, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.624673843383789 + }, + { + "auxiliary_loss_clip": 0.0113318, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00054169, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.3926100232332317, + "language_loss": 0.69470984, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71707797, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.5741922855377197 + }, + { + "auxiliary_loss_clip": 0.01148667, + "auxiliary_loss_mlp": 0.01103232, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.00042641, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.877099613001129, + "language_loss": 0.79018533, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.81270432, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.5185670852661133 + }, + { + "auxiliary_loss_clip": 0.01118674, + "auxiliary_loss_mlp": 0.0110281, + "balance_loss_clip": 1.00167584, + "balance_loss_mlp": 1.00048041, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 1.8209369874094572, + "language_loss": 0.77719301, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79940784, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.6015777587890625 + }, + { + "auxiliary_loss_clip": 0.01133034, + "auxiliary_loss_mlp": 0.01101865, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00048923, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 2.8946348898058574, + "language_loss": 0.77722502, + "learning_rate": 4.237617570010688e-08, + "loss": 0.79957408, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.582152843475342 + }, + { + "auxiliary_loss_clip": 0.01120856, + "auxiliary_loss_mlp": 0.01102959, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00034356, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.56450186477936, + "language_loss": 0.74728346, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76952159, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.630934953689575 + }, + { + "auxiliary_loss_clip": 0.01100507, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_clip": 1.00178802, + "balance_loss_mlp": 1.00056243, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.8363170095457506, + "language_loss": 0.68376732, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70580029, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 2.6976189613342285 + }, + { + "auxiliary_loss_clip": 0.01133336, + "auxiliary_loss_mlp": 0.01101801, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00052106, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 2.176632287274855, + "language_loss": 0.64795506, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67030644, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 2.562486410140991 + }, + { + "auxiliary_loss_clip": 0.01150276, + "auxiliary_loss_mlp": 0.01103237, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00033593, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 7.190312331900816, + "language_loss": 0.75857031, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.7811054, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.500889778137207 + }, + { + "auxiliary_loss_clip": 0.01101797, + "auxiliary_loss_mlp": 0.01102805, + "balance_loss_clip": 1.00162864, + "balance_loss_mlp": 1.00038004, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 1.7250952998355666, + "language_loss": 0.52257997, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.544626, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.7151637077331543 + }, + { + "auxiliary_loss_clip": 0.01087369, + "auxiliary_loss_mlp": 0.01101564, + "balance_loss_clip": 1.00168657, + "balance_loss_mlp": 1.00066495, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.4882367267371368, + "language_loss": 0.70706904, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72895837, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.7661707401275635 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01102888, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00036764, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.7464535043304443, + "language_loss": 0.76273966, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78510499, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 2.6083970069885254 + }, + { + "auxiliary_loss_clip": 0.01150345, + "auxiliary_loss_mlp": 0.01103754, + "balance_loss_clip": 1.00191283, + "balance_loss_mlp": 1.00037599, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.5878339339213365, + "language_loss": 0.66430414, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68684512, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.5461132526397705 + }, + { + "auxiliary_loss_clip": 0.01148622, + "auxiliary_loss_mlp": 0.0110247, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00042701, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.4933358643533146, + "language_loss": 0.76525956, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78777045, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.5648365020751953 + }, + { + "auxiliary_loss_clip": 0.01070778, + "auxiliary_loss_mlp": 0.01102933, + "balance_loss_clip": 1.00148988, + "balance_loss_mlp": 1.00050819, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 2.0267007710076794, + "language_loss": 0.73601371, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75775087, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 4.158198356628418 + }, + { + "auxiliary_loss_clip": 0.01165226, + "auxiliary_loss_mlp": 0.01103599, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.0005064, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 1.905642233740072, + "language_loss": 0.84485483, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86754304, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 2.5448882579803467 + }, + { + "auxiliary_loss_clip": 0.01148751, + "auxiliary_loss_mlp": 0.00747473, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.00044751, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.4522408831973108, + "language_loss": 0.72091526, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.73987746, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.704364776611328 + }, + { + "auxiliary_loss_clip": 0.01116534, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_clip": 1.00161171, + "balance_loss_mlp": 1.00029206, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.912886332929814, + "language_loss": 0.80522001, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82740015, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.6225836277008057 + }, + { + "auxiliary_loss_clip": 0.01116984, + "auxiliary_loss_mlp": 0.01103727, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00063443, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.5251876501909767, + "language_loss": 0.76565623, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78786331, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 2.6263673305511475 + }, + { + "auxiliary_loss_clip": 0.01133068, + "auxiliary_loss_mlp": 0.0110351, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00051284, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.8146936914626948, + "language_loss": 0.88017166, + "learning_rate": 4.118832771491387e-08, + "loss": 0.90253747, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 2.5422825813293457 + }, + { + "auxiliary_loss_clip": 0.0116485, + "auxiliary_loss_mlp": 0.0074735, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00036693, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.5397954179382776, + "language_loss": 0.78323942, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80236149, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.5210518836975098 + }, + { + "auxiliary_loss_clip": 0.01164829, + "auxiliary_loss_mlp": 0.01101611, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00042593, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.6585417725396057, + "language_loss": 0.77976859, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80243301, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.4701108932495117 + }, + { + "auxiliary_loss_clip": 0.01131144, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.0006547, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.6964070063989976, + "language_loss": 0.71742356, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73977727, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.6094813346862793 + }, + { + "auxiliary_loss_clip": 0.01148669, + "auxiliary_loss_mlp": 0.00747508, + "balance_loss_clip": 1.00172138, + "balance_loss_mlp": 1.00048232, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 1.9671776688066824, + "language_loss": 0.53522801, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.5541898, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.596126079559326 + }, + { + "auxiliary_loss_clip": 0.0114866, + "auxiliary_loss_mlp": 0.01102478, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00043511, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 1.9976342045594053, + "language_loss": 0.67295158, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69546294, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.5428860187530518 + }, + { + "auxiliary_loss_clip": 0.01132987, + "auxiliary_loss_mlp": 0.01102461, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.0003221, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 2.301351088760739, + "language_loss": 0.74163181, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76398629, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 4.017745733261108 + }, + { + "auxiliary_loss_clip": 0.01150043, + "auxiliary_loss_mlp": 0.01102638, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00040388, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.7118188023684762, + "language_loss": 0.73656577, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75909257, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.568002462387085 + }, + { + "auxiliary_loss_clip": 0.01128892, + "auxiliary_loss_mlp": 0.01102848, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00051904, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.795498669075662, + "language_loss": 0.76075739, + "learning_rate": 4.056164175257626e-08, + "loss": 0.7830748, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 3.992069959640503 + }, + { + "auxiliary_loss_clip": 0.01129054, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_clip": 1.0019176, + "balance_loss_mlp": 1.00050628, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.664150761777694, + "language_loss": 0.78758597, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80990386, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 2.5625948905944824 + }, + { + "auxiliary_loss_clip": 0.01165122, + "auxiliary_loss_mlp": 0.01103422, + "balance_loss_clip": 1.00194609, + "balance_loss_mlp": 1.00052106, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.4638971997321781, + "language_loss": 0.81306589, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83575135, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.4990234375 + }, + { + "auxiliary_loss_clip": 0.01114951, + "auxiliary_loss_mlp": 0.01104134, + "balance_loss_clip": 1.00163245, + "balance_loss_mlp": 1.00046968, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 1.7957014426846416, + "language_loss": 0.62865865, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65084946, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 4.061065196990967 + }, + { + "auxiliary_loss_clip": 0.01117297, + "auxiliary_loss_mlp": 0.01103296, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.00039434, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 2.0824102289091857, + "language_loss": 0.73373854, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75594449, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 2.6013073921203613 + }, + { + "auxiliary_loss_clip": 0.01131789, + "auxiliary_loss_mlp": 0.01102947, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00042677, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.885402126481164, + "language_loss": 0.69314522, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71549261, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.571655511856079 + }, + { + "auxiliary_loss_clip": 0.01145892, + "auxiliary_loss_mlp": 0.01078949, + "balance_loss_clip": 1.0011481, + "balance_loss_mlp": 0.99998492, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7532794812441691, + "language_loss": 0.5813188, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60356724, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 3.2925240993499756 + }, + { + "auxiliary_loss_clip": 0.01067624, + "auxiliary_loss_mlp": 0.01103335, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00043309, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 1.9557409668498626, + "language_loss": 0.72219557, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74390519, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 2.738603115081787 + }, + { + "auxiliary_loss_clip": 0.01164609, + "auxiliary_loss_mlp": 0.01101335, + "balance_loss_clip": 1.00173628, + "balance_loss_mlp": 1.00043654, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.5566739226748412, + "language_loss": 0.75932997, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78198946, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 2.4949748516082764 + }, + { + "auxiliary_loss_clip": 0.01135301, + "auxiliary_loss_mlp": 0.0110356, + "balance_loss_clip": 1.00187433, + "balance_loss_mlp": 1.00046802, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 2.5008698070464006, + "language_loss": 0.65768075, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.68006939, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 2.594848871231079 + }, + { + "auxiliary_loss_clip": 0.01116217, + "auxiliary_loss_mlp": 0.00747503, + "balance_loss_clip": 1.00147462, + "balance_loss_mlp": 1.00039315, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 2.2368037575192083, + "language_loss": 0.67688358, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69552076, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 2.7991628646850586 + }, + { + "auxiliary_loss_clip": 0.01148049, + "auxiliary_loss_mlp": 0.01101816, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00044084, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.7360209751957396, + "language_loss": 0.77443665, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79693532, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.5133309364318848 + }, + { + "auxiliary_loss_clip": 0.01150349, + "auxiliary_loss_mlp": 0.01103341, + "balance_loss_clip": 1.00185382, + "balance_loss_mlp": 1.00043988, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 2.0861717181085777, + "language_loss": 0.82611001, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84864694, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 2.546987295150757 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01103712, + "balance_loss_clip": 1.00199831, + "balance_loss_mlp": 1.0005244, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.6152056974633708, + "language_loss": 0.68789113, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.71041328, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.541938543319702 + }, + { + "auxiliary_loss_clip": 0.01116004, + "auxiliary_loss_mlp": 0.01102664, + "balance_loss_clip": 1.00162208, + "balance_loss_mlp": 1.00033498, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 1.8854432739487539, + "language_loss": 0.75316226, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77534896, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.6237685680389404 + }, + { + "auxiliary_loss_clip": 0.01080275, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00152695, + "balance_loss_mlp": 1.00044394, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 1.6909177676547829, + "language_loss": 0.75046742, + "learning_rate": 3.939942386953987e-08, + "loss": 0.772295, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 2.645474672317505 + }, + { + "auxiliary_loss_clip": 0.01114657, + "auxiliary_loss_mlp": 0.01102548, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.0004096, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 1.8005322047562602, + "language_loss": 0.65959346, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68176556, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.581557512283325 + }, + { + "auxiliary_loss_clip": 0.01148263, + "auxiliary_loss_mlp": 0.01101946, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00047517, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.5857393461729035, + "language_loss": 0.56805515, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59055728, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 2.529071569442749 + }, + { + "auxiliary_loss_clip": 0.01131562, + "auxiliary_loss_mlp": 0.01102704, + "balance_loss_clip": 1.001706, + "balance_loss_mlp": 1.00056529, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.0139556363622724, + "language_loss": 0.70574301, + "learning_rate": 3.916898732330764e-08, + "loss": 0.7280857, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 2.6300175189971924 + }, + { + "auxiliary_loss_clip": 0.01148438, + "auxiliary_loss_mlp": 0.01103683, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00039983, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 2.0051162773050137, + "language_loss": 0.80950797, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83202922, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 2.5240793228149414 + }, + { + "auxiliary_loss_clip": 0.01132993, + "auxiliary_loss_mlp": 0.01102916, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00039625, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.6416340662123061, + "language_loss": 0.71786249, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74022162, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.5983450412750244 + }, + { + "auxiliary_loss_clip": 0.01164948, + "auxiliary_loss_mlp": 0.01102549, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00045788, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 1.8007979862217365, + "language_loss": 0.66108477, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68375981, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.5133774280548096 + }, + { + "auxiliary_loss_clip": 0.01148369, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00042892, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 3.593828600633569, + "language_loss": 0.73778665, + "learning_rate": 3.886277957725092e-08, + "loss": 0.76030368, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.511654853820801 + }, + { + "auxiliary_loss_clip": 0.01165247, + "auxiliary_loss_mlp": 0.01103697, + "balance_loss_clip": 1.00190532, + "balance_loss_mlp": 1.00041449, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.8734328067702026, + "language_loss": 0.70510221, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72779167, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 2.4798293113708496 + }, + { + "auxiliary_loss_clip": 0.01135654, + "auxiliary_loss_mlp": 0.01103574, + "balance_loss_clip": 1.00192273, + "balance_loss_mlp": 1.00057673, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.6438022212062253, + "language_loss": 0.77598429, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79837656, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 4.02984094619751 + }, + { + "auxiliary_loss_clip": 0.01148043, + "auxiliary_loss_mlp": 0.01101689, + "balance_loss_clip": 1.00180686, + "balance_loss_mlp": 1.00040925, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 1.8399371208638629, + "language_loss": 0.73755175, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.7600491, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 2.5068721771240234 + }, + { + "auxiliary_loss_clip": 0.01118871, + "auxiliary_loss_mlp": 0.01103859, + "balance_loss_clip": 1.00176227, + "balance_loss_mlp": 1.00057578, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 2.0631066906246027, + "language_loss": 0.66141671, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68364394, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.5805487632751465 + }, + { + "auxiliary_loss_clip": 0.01135146, + "auxiliary_loss_mlp": 0.01102577, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00053382, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.6415192685141315, + "language_loss": 0.71581566, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73819292, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 2.588960647583008 + }, + { + "auxiliary_loss_clip": 0.01148182, + "auxiliary_loss_mlp": 0.01103664, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.000476, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 2.7586436275472286, + "language_loss": 0.72537798, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74789643, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.6562013626098633 + }, + { + "auxiliary_loss_clip": 0.01116927, + "auxiliary_loss_mlp": 0.01101917, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.0004462, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 1.9856642321691562, + "language_loss": 0.89384651, + "learning_rate": 3.832977924388614e-08, + "loss": 0.916035, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 2.625906467437744 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01102405, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00045764, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 2.020592101914764, + "language_loss": 0.83559144, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85809934, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.588066577911377 + }, + { + "auxiliary_loss_clip": 0.01131506, + "auxiliary_loss_mlp": 0.01079385, + "balance_loss_clip": 1.00115013, + "balance_loss_mlp": 1.00003958, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.782808316081888, + "language_loss": 0.56070912, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58281803, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.1204755306243896 + }, + { + "auxiliary_loss_clip": 0.01101863, + "auxiliary_loss_mlp": 0.0110188, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.00050449, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.5370686022837234, + "language_loss": 0.69891191, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72094941, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 2.660095691680908 + }, + { + "auxiliary_loss_clip": 0.0114814, + "auxiliary_loss_mlp": 0.01102554, + "balance_loss_clip": 1.00174916, + "balance_loss_mlp": 1.00046325, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.5640335043765554, + "language_loss": 0.75571221, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.7782191, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.5836422443389893 + }, + { + "auxiliary_loss_clip": 0.0108664, + "auxiliary_loss_mlp": 0.01102368, + "balance_loss_clip": 1.00156236, + "balance_loss_mlp": 1.00041986, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.8031274986615744, + "language_loss": 0.74293053, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76482058, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.668060302734375 + }, + { + "auxiliary_loss_clip": 0.01134046, + "auxiliary_loss_mlp": 0.01101824, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00054359, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 2.0265150559811502, + "language_loss": 0.69639993, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71875858, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 3.9555389881134033 + }, + { + "auxiliary_loss_clip": 0.01100276, + "auxiliary_loss_mlp": 0.01102825, + "balance_loss_clip": 1.00169837, + "balance_loss_mlp": 1.00049567, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 2.0046062442397288, + "language_loss": 0.75124341, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77327442, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.6650309562683105 + }, + { + "auxiliary_loss_clip": 0.01150505, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.00036561, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.495916439515482, + "language_loss": 0.74186468, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76441383, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 4.005968332290649 + }, + { + "auxiliary_loss_clip": 0.01165004, + "auxiliary_loss_mlp": 0.01103646, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00055361, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 2.1477501988909147, + "language_loss": 0.72880816, + "learning_rate": 3.764984908264823e-08, + "loss": 0.75149471, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 2.537379026412964 + }, + { + "auxiliary_loss_clip": 0.01150272, + "auxiliary_loss_mlp": 0.01103273, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00037122, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.5664059644349348, + "language_loss": 0.69135022, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71388566, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.524637222290039 + }, + { + "auxiliary_loss_clip": 0.01097806, + "auxiliary_loss_mlp": 0.01101744, + "balance_loss_clip": 1.00166726, + "balance_loss_mlp": 1.00046432, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.5419367222858242, + "language_loss": 0.73962533, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76162088, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.6816844940185547 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01103052, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00034177, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 2.0729262621449434, + "language_loss": 0.83197868, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.85449195, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 4.015934944152832 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01102556, + "balance_loss_clip": 1.00170743, + "balance_loss_mlp": 1.00041747, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.0718900741175665, + "language_loss": 0.69390267, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.7159397, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.6533124446868896 + }, + { + "auxiliary_loss_clip": 0.01148034, + "auxiliary_loss_mlp": 0.01101998, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00062215, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.7524908351621733, + "language_loss": 0.8480649, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87056518, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 2.5784313678741455 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.0074731, + "balance_loss_clip": 1.00167954, + "balance_loss_mlp": 1.00037634, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.4492432219207598, + "language_loss": 0.77976835, + "learning_rate": 3.719991074263662e-08, + "loss": 0.79857576, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.634568214416504 + }, + { + "auxiliary_loss_clip": 0.01148205, + "auxiliary_loss_mlp": 0.01103044, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.0004288, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.7262715287599342, + "language_loss": 0.74159992, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76411241, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 2.6152052879333496 + }, + { + "auxiliary_loss_clip": 0.01148488, + "auxiliary_loss_mlp": 0.01104697, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00046027, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 2.0937290792257985, + "language_loss": 0.82093745, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84346932, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 2.5106871128082275 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01102882, + "balance_loss_clip": 1.00169325, + "balance_loss_mlp": 1.00045717, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 2.4040624145679406, + "language_loss": 0.68260038, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70511305, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 2.5719754695892334 + }, + { + "auxiliary_loss_clip": 0.01150423, + "auxiliary_loss_mlp": 0.01104483, + "balance_loss_clip": 1.00190365, + "balance_loss_mlp": 1.00062835, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 1.8590600058865319, + "language_loss": 0.7684297, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.79097879, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.53177547454834 + }, + { + "auxiliary_loss_clip": 0.01150537, + "auxiliary_loss_mlp": 0.01102555, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.000512, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.6537702146115552, + "language_loss": 0.67428792, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69681889, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 2.5455267429351807 + }, + { + "auxiliary_loss_clip": 0.01130539, + "auxiliary_loss_mlp": 0.00747317, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00029802, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.7640529964269616, + "language_loss": 0.70489079, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72366941, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.6300065517425537 + }, + { + "auxiliary_loss_clip": 0.01150249, + "auxiliary_loss_mlp": 0.0110277, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00044036, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.5413433456389485, + "language_loss": 0.7409606, + "learning_rate": 3.667836926755208e-08, + "loss": 0.7634908, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 2.5550200939178467 + }, + { + "auxiliary_loss_clip": 0.01127504, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_clip": 1.00114977, + "balance_loss_mlp": 0.99999923, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8831289380908851, + "language_loss": 0.63560033, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65766501, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.267106533050537 + }, + { + "auxiliary_loss_clip": 0.0116484, + "auxiliary_loss_mlp": 0.01102166, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00050473, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.4067658602903945, + "language_loss": 0.66267556, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68534559, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.5329532623291016 + }, + { + "auxiliary_loss_clip": 0.0111833, + "auxiliary_loss_mlp": 0.01102661, + "balance_loss_clip": 1.00162888, + "balance_loss_mlp": 1.00033176, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 1.8442603204457977, + "language_loss": 0.7784093, + "learning_rate": 3.645596817637586e-08, + "loss": 0.80061924, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 2.597339391708374 + }, + { + "auxiliary_loss_clip": 0.01101385, + "auxiliary_loss_mlp": 0.01102677, + "balance_loss_clip": 1.00173581, + "balance_loss_mlp": 1.0004431, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.5889742320608236, + "language_loss": 0.7460382, + "learning_rate": 3.638198339114451e-08, + "loss": 0.7680788, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.6781816482543945 + }, + { + "auxiliary_loss_clip": 0.01164746, + "auxiliary_loss_mlp": 0.01102801, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00047171, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.9680476297825735, + "language_loss": 0.72379112, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74646664, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 2.4730184078216553 + }, + { + "auxiliary_loss_clip": 0.01117265, + "auxiliary_loss_mlp": 0.01104836, + "balance_loss_clip": 1.00174034, + "balance_loss_mlp": 1.00050402, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.5901797089366787, + "language_loss": 0.66464156, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68686259, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.641711711883545 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.0110368, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00039685, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 1.7152707538677143, + "language_loss": 0.77886552, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.8015523, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.50578236579895 + }, + { + "auxiliary_loss_clip": 0.01150334, + "auxiliary_loss_mlp": 0.01103926, + "balance_loss_clip": 1.00173104, + "balance_loss_mlp": 1.0004518, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.458216726589818, + "language_loss": 0.70201266, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72455519, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.68349027633667 + }, + { + "auxiliary_loss_clip": 0.01164986, + "auxiliary_loss_mlp": 0.01103101, + "balance_loss_clip": 1.0019058, + "balance_loss_mlp": 1.00048554, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 1.7480689751775789, + "language_loss": 0.71994305, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74262393, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 2.4781041145324707 + }, + { + "auxiliary_loss_clip": 0.01130814, + "auxiliary_loss_mlp": 0.01102527, + "balance_loss_clip": 1.00198925, + "balance_loss_mlp": 1.00038862, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 2.160389707241886, + "language_loss": 0.77964133, + "learning_rate": 3.593963845018377e-08, + "loss": 0.80197477, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.6032562255859375 + }, + { + "auxiliary_loss_clip": 0.01118112, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_clip": 1.00164104, + "balance_loss_mlp": 1.00030291, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 1.968056315427301, + "language_loss": 0.84013391, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86234033, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 3.997162342071533 + }, + { + "auxiliary_loss_clip": 0.01165097, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.718165521566674, + "language_loss": 0.70180565, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72449678, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 2.4690492153167725 + }, + { + "auxiliary_loss_clip": 0.01131703, + "auxiliary_loss_mlp": 0.01102831, + "balance_loss_clip": 1.00158358, + "balance_loss_mlp": 1.00050175, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.9813648279677485, + "language_loss": 0.797167, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81951237, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 2.603572130203247 + }, + { + "auxiliary_loss_clip": 0.0111616, + "auxiliary_loss_mlp": 0.01101377, + "balance_loss_clip": 1.0016011, + "balance_loss_mlp": 1.00038326, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4705616223803724, + "language_loss": 0.68076319, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70293856, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.01150155, + "auxiliary_loss_mlp": 0.01101663, + "balance_loss_clip": 1.00176728, + "balance_loss_mlp": 1.00038278, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.1995220414063015, + "language_loss": 0.66263235, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68515056, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 2.5066940784454346 + }, + { + "auxiliary_loss_clip": 0.01127259, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_clip": 1.00114548, + "balance_loss_mlp": 1.0000546, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7706997778628102, + "language_loss": 0.59329665, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61536324, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.2279789447784424 + }, + { + "auxiliary_loss_clip": 0.01150358, + "auxiliary_loss_mlp": 0.01104331, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00047541, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 2.2632953826657163, + "language_loss": 0.66727328, + "learning_rate": 3.542695811435914e-08, + "loss": 0.68982011, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.6418869495391846 + }, + { + "auxiliary_loss_clip": 0.01131318, + "auxiliary_loss_mlp": 0.01103169, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.0003624, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 1.959704768798282, + "language_loss": 0.73490351, + "learning_rate": 3.535401603143207e-08, + "loss": 0.7572484, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.555474281311035 + }, + { + "auxiliary_loss_clip": 0.01164995, + "auxiliary_loss_mlp": 0.0110229, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00043809, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 1.980202892531752, + "language_loss": 0.64086384, + "learning_rate": 3.528114844807773e-08, + "loss": 0.66353667, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.457253932952881 + }, + { + "auxiliary_loss_clip": 0.01117138, + "auxiliary_loss_mlp": 0.01102903, + "balance_loss_clip": 1.00154579, + "balance_loss_mlp": 1.00057411, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.7121733559344554, + "language_loss": 0.79113668, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81333715, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.5818235874176025 + }, + { + "auxiliary_loss_clip": 0.01164731, + "auxiliary_loss_mlp": 0.01102082, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00046778, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.5618659120348388, + "language_loss": 0.75462019, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77728838, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 2.496633291244507 + }, + { + "auxiliary_loss_clip": 0.01095562, + "auxiliary_loss_mlp": 0.01102715, + "balance_loss_clip": 1.00160241, + "balance_loss_mlp": 1.00048113, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.1394191590064704, + "language_loss": 0.59090412, + "learning_rate": 3.506299272306723e-08, + "loss": 0.61288691, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 4.016417026519775 + }, + { + "auxiliary_loss_clip": 0.0111824, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_clip": 1.0018096, + "balance_loss_mlp": 1.00040185, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.5394621400069786, + "language_loss": 0.76917607, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79137814, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.602698802947998 + }, + { + "auxiliary_loss_clip": 0.01164969, + "auxiliary_loss_mlp": 0.01103152, + "balance_loss_clip": 1.00194001, + "balance_loss_mlp": 1.00053668, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.7242312969672189, + "language_loss": 0.65269309, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67537427, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 4.030287265777588 + }, + { + "auxiliary_loss_clip": 0.01133532, + "auxiliary_loss_mlp": 0.01102468, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00042534, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.792860813946151, + "language_loss": 0.79569441, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81805444, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.5739262104034424 + }, + { + "auxiliary_loss_clip": 0.0111824, + "auxiliary_loss_mlp": 0.01103642, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00045466, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.113216800599858, + "language_loss": 0.73261672, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75483555, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 2.5969133377075195 + }, + { + "auxiliary_loss_clip": 0.0115046, + "auxiliary_loss_mlp": 0.01102237, + "balance_loss_clip": 1.00192606, + "balance_loss_mlp": 1.00047994, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.4722368552345266, + "language_loss": 0.70300984, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72553682, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.609351873397827 + }, + { + "auxiliary_loss_clip": 0.0116489, + "auxiliary_loss_mlp": 0.01103071, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00036001, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.6884889314193852, + "language_loss": 0.81218302, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83486265, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 3.9276819229125977 + }, + { + "auxiliary_loss_clip": 0.01133518, + "auxiliary_loss_mlp": 0.01102944, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00042439, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.7483238786549078, + "language_loss": 0.62563848, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.6480031, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 2.5862343311309814 + }, + { + "auxiliary_loss_clip": 0.01134651, + "auxiliary_loss_mlp": 0.01102882, + "balance_loss_clip": 1.00175226, + "balance_loss_mlp": 1.00055289, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.6416569311547784, + "language_loss": 0.66807914, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69045448, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 2.582780599594116 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.0110374, + "balance_loss_clip": 1.00172675, + "balance_loss_mlp": 1.00045681, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.9862315105927073, + "language_loss": 0.64278769, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66499448, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 2.644688844680786 + }, + { + "auxiliary_loss_clip": 0.01118309, + "auxiliary_loss_mlp": 0.0110313, + "balance_loss_clip": 1.00167799, + "balance_loss_mlp": 1.00051498, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.4897349910021493, + "language_loss": 0.74279135, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76500583, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 2.6276094913482666 + }, + { + "auxiliary_loss_clip": 0.0113312, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00060511, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.4664098944576867, + "language_loss": 0.77667844, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79904282, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.5507330894470215 + }, + { + "auxiliary_loss_clip": 0.01148581, + "auxiliary_loss_mlp": 0.01102262, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00050509, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 2.0912720308626898, + "language_loss": 0.75645399, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77896243, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 2.5374388694763184 + }, + { + "auxiliary_loss_clip": 0.01129058, + "auxiliary_loss_mlp": 0.0110284, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.00051093, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 2.8216513778204444, + "language_loss": 0.65783054, + "learning_rate": 3.412540130236086e-08, + "loss": 0.68014956, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 2.5894582271575928 + }, + { + "auxiliary_loss_clip": 0.01115689, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_clip": 1.00156999, + "balance_loss_mlp": 1.0003916, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 1.657641557756681, + "language_loss": 0.76648355, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78866088, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 2.63335919380188 + }, + { + "auxiliary_loss_clip": 0.01150514, + "auxiliary_loss_mlp": 0.01103697, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00060439, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 3.6401233261134767, + "language_loss": 0.75832731, + "learning_rate": 3.398227451090885e-08, + "loss": 0.78086936, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.5064785480499268 + }, + { + "auxiliary_loss_clip": 0.01164958, + "auxiliary_loss_mlp": 0.01102044, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00028706, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.6240655696623998, + "language_loss": 0.76997334, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79264331, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.5337798595428467 + }, + { + "auxiliary_loss_clip": 0.01147593, + "auxiliary_loss_mlp": 0.01102033, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00046718, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.755811563239593, + "language_loss": 0.75338709, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77588332, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 2.559936761856079 + }, + { + "auxiliary_loss_clip": 0.01148162, + "auxiliary_loss_mlp": 0.01103394, + "balance_loss_clip": 1.00173759, + "balance_loss_mlp": 1.00049233, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.6974484581363682, + "language_loss": 0.80488533, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82740092, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.546485185623169 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.0110338, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00047827, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 4.4405203682041305, + "language_loss": 0.75962281, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78197193, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.5411877632141113 + }, + { + "auxiliary_loss_clip": 0.01133422, + "auxiliary_loss_mlp": 0.01102227, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00037503, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 4.028133766401109, + "language_loss": 0.68291724, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70527375, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.62721586227417 + }, + { + "auxiliary_loss_clip": 0.01150153, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00057483, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 2.3511798599976, + "language_loss": 0.80846238, + "learning_rate": 3.35546834612872e-08, + "loss": 0.83098149, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.529193162918091 + }, + { + "auxiliary_loss_clip": 0.01148574, + "auxiliary_loss_mlp": 0.01102636, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00049758, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.776613215429919, + "language_loss": 0.60150361, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62401569, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 2.629908323287964 + }, + { + "auxiliary_loss_clip": 0.01120341, + "auxiliary_loss_mlp": 0.01102949, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00042939, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.4519693630345873, + "language_loss": 0.66615444, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68838739, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.602677583694458 + }, + { + "auxiliary_loss_clip": 0.01148523, + "auxiliary_loss_mlp": 0.01103023, + "balance_loss_clip": 1.00168788, + "balance_loss_mlp": 1.00050259, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.2592209985004246, + "language_loss": 0.74734879, + "learning_rate": 3.334189456537251e-08, + "loss": 0.76986426, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.520782232284546 + }, + { + "auxiliary_loss_clip": 0.01119812, + "auxiliary_loss_mlp": 0.01102919, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00049448, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 2.0770949353637467, + "language_loss": 0.7314899, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75371718, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.641732692718506 + }, + { + "auxiliary_loss_clip": 0.01110519, + "auxiliary_loss_mlp": 0.01079368, + "balance_loss_clip": 1.00129604, + "balance_loss_mlp": 1.00002241, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.8244715625007447, + "language_loss": 0.50611579, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52801466, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.2352395057678223 + }, + { + "auxiliary_loss_clip": 0.01135548, + "auxiliary_loss_mlp": 0.0110213, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00056338, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.7123970203863648, + "language_loss": 0.6542064, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67658317, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 2.5846078395843506 + }, + { + "auxiliary_loss_clip": 0.01148284, + "auxiliary_loss_mlp": 0.01103069, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00035787, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.54150792353251, + "language_loss": 0.66352886, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68604237, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 4.053234338760376 + }, + { + "auxiliary_loss_clip": 0.01114204, + "auxiliary_loss_mlp": 0.01079354, + "balance_loss_clip": 1.00112271, + "balance_loss_mlp": 1.00000823, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8550923052669822, + "language_loss": 0.63193238, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65386796, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.09226393699646 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01103358, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00055146, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.764542989404932, + "language_loss": 0.69573998, + "learning_rate": 3.291833039444092e-08, + "loss": 0.71809012, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 2.5851831436157227 + }, + { + "auxiliary_loss_clip": 0.01118615, + "auxiliary_loss_mlp": 0.01102644, + "balance_loss_clip": 1.00165963, + "balance_loss_mlp": 1.00041044, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 1.9050177195649522, + "language_loss": 0.74479705, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76700962, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 2.5959365367889404 + }, + { + "auxiliary_loss_clip": 0.01067678, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_clip": 1.00166965, + "balance_loss_mlp": 1.0004189, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.916874853858392, + "language_loss": 0.70263499, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72433829, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 2.71561598777771 + }, + { + "auxiliary_loss_clip": 0.01100678, + "auxiliary_loss_mlp": 0.01103297, + "balance_loss_clip": 1.0015049, + "balance_loss_mlp": 1.00039554, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 1.771648428980947, + "language_loss": 0.77511889, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.79715866, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 2.6621055603027344 + }, + { + "auxiliary_loss_clip": 0.01150183, + "auxiliary_loss_mlp": 0.0110394, + "balance_loss_clip": 1.00195062, + "balance_loss_mlp": 1.00046659, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.7369308745232945, + "language_loss": 0.66810572, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.69064695, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.5315144062042236 + }, + { + "auxiliary_loss_clip": 0.01148464, + "auxiliary_loss_mlp": 0.01103833, + "balance_loss_clip": 1.0020262, + "balance_loss_mlp": 1.00045466, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 1.5885692048430557, + "language_loss": 0.73232055, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75484359, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.6075830459594727 + }, + { + "auxiliary_loss_clip": 0.0114814, + "auxiliary_loss_mlp": 0.01102895, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.0004704, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 7.644542873067595, + "language_loss": 0.7433092, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76581955, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.5331876277923584 + }, + { + "auxiliary_loss_clip": 0.01133069, + "auxiliary_loss_mlp": 0.01102605, + "balance_loss_clip": 1.0018872, + "balance_loss_mlp": 1.00056159, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 1.9177667645944902, + "language_loss": 0.77054781, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.7929045, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 3.944361686706543 + }, + { + "auxiliary_loss_clip": 0.01148007, + "auxiliary_loss_mlp": 0.01101192, + "balance_loss_clip": 1.00169921, + "balance_loss_mlp": 1.00029325, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 2.4912868572573146, + "language_loss": 0.69504595, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71753794, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 2.5398004055023193 + }, + { + "auxiliary_loss_clip": 0.01164809, + "auxiliary_loss_mlp": 0.01102628, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00039434, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.9535466009812357, + "language_loss": 0.69421577, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71689016, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 2.491342544555664 + }, + { + "auxiliary_loss_clip": 0.01150338, + "auxiliary_loss_mlp": 0.0110265, + "balance_loss_clip": 1.00197005, + "balance_loss_mlp": 1.00051188, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.2819233704395026, + "language_loss": 0.7028659, + "learning_rate": 3.221835774749748e-08, + "loss": 0.72539574, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 3.9193055629730225 + }, + { + "auxiliary_loss_clip": 0.01101919, + "auxiliary_loss_mlp": 0.01101933, + "balance_loss_clip": 1.00158691, + "balance_loss_mlp": 1.0005579, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 1.8496383163322785, + "language_loss": 0.85013157, + "learning_rate": 3.214877084074774e-08, + "loss": 0.87217009, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.6339054107666016 + }, + { + "auxiliary_loss_clip": 0.01119082, + "auxiliary_loss_mlp": 0.01103061, + "balance_loss_clip": 1.00165677, + "balance_loss_mlp": 1.00054073, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.5856084861681212, + "language_loss": 0.71327579, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73549724, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.616619348526001 + }, + { + "auxiliary_loss_clip": 0.01150391, + "auxiliary_loss_mlp": 0.01103721, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.8096335103445111, + "language_loss": 0.69627738, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71881849, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 3.9860377311706543 + }, + { + "auxiliary_loss_clip": 0.0114853, + "auxiliary_loss_mlp": 0.01103834, + "balance_loss_clip": 1.00188386, + "balance_loss_mlp": 1.00055075, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.0163313765653292, + "language_loss": 0.70858675, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73111033, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 2.519326686859131 + }, + { + "auxiliary_loss_clip": 0.0113339, + "auxiliary_loss_mlp": 0.011022, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00044298, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.4969971289215922, + "language_loss": 0.76642346, + "learning_rate": 3.187116945125212e-08, + "loss": 0.78877938, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.64797043800354 + }, + { + "auxiliary_loss_clip": 0.01117875, + "auxiliary_loss_mlp": 0.01102825, + "balance_loss_clip": 1.00165224, + "balance_loss_mlp": 1.00040078, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.822324553076772, + "language_loss": 0.68360686, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.70581383, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 2.626469135284424 + }, + { + "auxiliary_loss_clip": 0.01114471, + "auxiliary_loss_mlp": 0.0110294, + "balance_loss_clip": 1.00165486, + "balance_loss_mlp": 1.00051522, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 1.7240074290977023, + "language_loss": 0.74403059, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76620477, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.637065887451172 + }, + { + "auxiliary_loss_clip": 0.01132658, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_clip": 1.00179899, + "balance_loss_mlp": 1.00047731, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.5834419356117466, + "language_loss": 0.62219954, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64455414, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.01148596, + "auxiliary_loss_mlp": 0.01103229, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00051796, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.771676216494908, + "language_loss": 0.79159445, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81411266, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 2.5021204948425293 + }, + { + "auxiliary_loss_clip": 0.01143943, + "auxiliary_loss_mlp": 0.01079019, + "balance_loss_clip": 1.00116205, + "balance_loss_mlp": 1.00005519, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.7037710705380676, + "language_loss": 0.57756495, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59979463, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 3.147442579269409 + }, + { + "auxiliary_loss_clip": 0.01104176, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00046527, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.57231144577876, + "language_loss": 0.75772804, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77624398, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 2.6887619495391846 + }, + { + "auxiliary_loss_clip": 0.01147973, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_clip": 1.00170648, + "balance_loss_mlp": 1.00049114, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.6818357903704249, + "language_loss": 0.72761559, + "learning_rate": 3.138824043864452e-08, + "loss": 0.75011396, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 2.600287914276123 + }, + { + "auxiliary_loss_clip": 0.01103517, + "auxiliary_loss_mlp": 0.01103075, + "balance_loss_clip": 1.00158787, + "balance_loss_mlp": 1.00046015, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 5.627143537552774, + "language_loss": 0.84991527, + "learning_rate": 3.131954915863244e-08, + "loss": 0.8719812, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 2.670746326446533 + }, + { + "auxiliary_loss_clip": 0.01126506, + "auxiliary_loss_mlp": 0.01078986, + "balance_loss_clip": 1.00104737, + "balance_loss_mlp": 1.00002134, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8983315935923197, + "language_loss": 0.64533973, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66739464, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.064007043838501 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01104162, + "balance_loss_clip": 1.001652, + "balance_loss_mlp": 1.00049806, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.0959063483470004, + "language_loss": 0.73411167, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75632608, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.5739612579345703 + }, + { + "auxiliary_loss_clip": 0.01116574, + "auxiliary_loss_mlp": 0.01102505, + "balance_loss_clip": 1.00165701, + "balance_loss_mlp": 1.00046206, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.1986990121228667, + "language_loss": 0.84869778, + "learning_rate": 3.111392324436024e-08, + "loss": 0.87088859, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.6220128536224365 + }, + { + "auxiliary_loss_clip": 0.01133032, + "auxiliary_loss_mlp": 0.01102778, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 2.0325351876683193, + "language_loss": 0.70901072, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73136884, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 2.5911970138549805 + }, + { + "auxiliary_loss_clip": 0.01133679, + "auxiliary_loss_mlp": 0.01102944, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00051987, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.6873302126737497, + "language_loss": 0.60915691, + "learning_rate": 3.097721259896735e-08, + "loss": 0.63152313, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.582254648208618 + }, + { + "auxiliary_loss_clip": 0.01150079, + "auxiliary_loss_mlp": 0.01101565, + "balance_loss_clip": 1.00182807, + "balance_loss_mlp": 1.0003798, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.6658824672947745, + "language_loss": 0.81553507, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.83805156, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.513859272003174 + }, + { + "auxiliary_loss_clip": 0.01083481, + "auxiliary_loss_mlp": 0.01079713, + "balance_loss_clip": 1.00120282, + "balance_loss_mlp": 0.99998552, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7273046050260049, + "language_loss": 0.59083879, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61247075, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.2113261222839355 + }, + { + "auxiliary_loss_clip": 0.01164828, + "auxiliary_loss_mlp": 0.01101688, + "balance_loss_clip": 1.00177324, + "balance_loss_mlp": 1.0004077, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.7809648135242693, + "language_loss": 0.76471204, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78737718, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 2.4852805137634277 + }, + { + "auxiliary_loss_clip": 0.0112041, + "auxiliary_loss_mlp": 0.01103301, + "balance_loss_clip": 1.0017252, + "balance_loss_mlp": 1.00049472, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.4932104145766358, + "language_loss": 0.62492204, + "learning_rate": 3.070468731536047e-08, + "loss": 0.6471591, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.6477088928222656 + }, + { + "auxiliary_loss_clip": 0.01149737, + "auxiliary_loss_mlp": 0.01103092, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00038171, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 2.060262419401107, + "language_loss": 0.63813865, + "learning_rate": 3.063674267769589e-08, + "loss": 0.660667, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.587031364440918 + }, + { + "auxiliary_loss_clip": 0.0114874, + "auxiliary_loss_mlp": 0.011045, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00035846, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 2.035709252496313, + "language_loss": 0.84160149, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86413389, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.5140397548675537 + }, + { + "auxiliary_loss_clip": 0.01150159, + "auxiliary_loss_mlp": 0.01101967, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00059152, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.4394330748491886, + "language_loss": 0.72265601, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74517733, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 3.977041721343994 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01101125, + "balance_loss_clip": 1.00165129, + "balance_loss_mlp": 1.00046444, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 4.543229578117206, + "language_loss": 0.86856186, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89105183, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.577627420425415 + }, + { + "auxiliary_loss_clip": 0.01134261, + "auxiliary_loss_mlp": 0.01102846, + "balance_loss_clip": 1.00178361, + "balance_loss_mlp": 1.00042129, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 2.1441616086089734, + "language_loss": 0.67055047, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69292152, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 2.5868310928344727 + }, + { + "auxiliary_loss_clip": 0.01097203, + "auxiliary_loss_mlp": 0.01079385, + "balance_loss_clip": 1.00139952, + "balance_loss_mlp": 1.0000391, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8655000140409812, + "language_loss": 0.65283984, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67460573, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 3.2405660152435303 + }, + { + "auxiliary_loss_clip": 0.01143314, + "auxiliary_loss_mlp": 0.01079334, + "balance_loss_clip": 1.00120473, + "balance_loss_mlp": 0.9999879, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.798341575576351, + "language_loss": 0.58811963, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.61034608, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 3.1056711673736572 + }, + { + "auxiliary_loss_clip": 0.01150082, + "auxiliary_loss_mlp": 0.01102272, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00061035, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.5794304489879631, + "language_loss": 0.71576083, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73828435, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.5789618492126465 + }, + { + "auxiliary_loss_clip": 0.01150283, + "auxiliary_loss_mlp": 0.01103178, + "balance_loss_clip": 1.00184548, + "balance_loss_mlp": 1.0004673, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.3752311159149575, + "language_loss": 0.63828242, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66081703, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 2.574373722076416 + }, + { + "auxiliary_loss_clip": 0.01131334, + "auxiliary_loss_mlp": 0.01102236, + "balance_loss_clip": 1.00175309, + "balance_loss_mlp": 1.00047898, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.844939765807165, + "language_loss": 0.66312009, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.6854558, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.6144824028015137 + }, + { + "auxiliary_loss_clip": 0.01148195, + "auxiliary_loss_mlp": 0.01102818, + "balance_loss_clip": 1.00179398, + "balance_loss_mlp": 1.00039315, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 1.8933599493625652, + "language_loss": 0.75982177, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.78233182, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 2.508728504180908 + }, + { + "auxiliary_loss_clip": 0.01150249, + "auxiliary_loss_mlp": 0.01101712, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00043142, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 1.8690779819612717, + "language_loss": 0.72383791, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74635756, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 2.523909568786621 + }, + { + "auxiliary_loss_clip": 0.01117223, + "auxiliary_loss_mlp": 0.01103975, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00050175, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 1.7152749653513482, + "language_loss": 0.79778308, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81999505, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 4.110236644744873 + }, + { + "auxiliary_loss_clip": 0.0113156, + "auxiliary_loss_mlp": 0.01102605, + "balance_loss_clip": 1.00179207, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.4838171696469649, + "language_loss": 0.78114319, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.8034848, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.6034350395202637 + }, + { + "auxiliary_loss_clip": 0.01133512, + "auxiliary_loss_mlp": 0.01103218, + "balance_loss_clip": 1.00167203, + "balance_loss_mlp": 1.00050759, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.3941697001000324, + "language_loss": 0.70145553, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72382283, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.581416606903076 + }, + { + "auxiliary_loss_clip": 0.01132175, + "auxiliary_loss_mlp": 0.01102951, + "balance_loss_clip": 1.001683, + "balance_loss_mlp": 1.00043106, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.666956978095869, + "language_loss": 0.563317, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58566833, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 3.9430489540100098 + }, + { + "auxiliary_loss_clip": 0.01068553, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_clip": 1.0011425, + "balance_loss_mlp": 0.99999887, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.65355069459544, + "language_loss": 0.53261143, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55408657, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 3.48046612739563 + }, + { + "auxiliary_loss_clip": 0.01133525, + "auxiliary_loss_mlp": 0.01102584, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00044513, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.58335658065405, + "language_loss": 0.66183174, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68419284, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 2.6278138160705566 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01103046, + "balance_loss_clip": 1.00170135, + "balance_loss_mlp": 1.00043035, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 1.7850848270419473, + "language_loss": 0.76289392, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78510737, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 4.095236301422119 + }, + { + "auxiliary_loss_clip": 0.01148223, + "auxiliary_loss_mlp": 0.01102569, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00033498, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.9233074334560685, + "language_loss": 0.68042219, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70293015, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.551607131958008 + }, + { + "auxiliary_loss_clip": 0.0111863, + "auxiliary_loss_mlp": 0.01103107, + "balance_loss_clip": 1.00177765, + "balance_loss_mlp": 1.00049174, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.5244946338143168, + "language_loss": 0.6545589, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67677629, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 2.642416000366211 + }, + { + "auxiliary_loss_clip": 0.01149896, + "auxiliary_loss_mlp": 0.01102894, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00037384, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.5251765054134077, + "language_loss": 0.7157796, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73830748, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.5572502613067627 + }, + { + "auxiliary_loss_clip": 0.0116499, + "auxiliary_loss_mlp": 0.01103313, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00050652, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.9984564763186665, + "language_loss": 0.69881487, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72149795, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 2.517819404602051 + }, + { + "auxiliary_loss_clip": 0.01164971, + "auxiliary_loss_mlp": 0.01103585, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00049281, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.8950936460899257, + "language_loss": 0.78665406, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.80933964, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 2.4789860248565674 + }, + { + "auxiliary_loss_clip": 0.01117504, + "auxiliary_loss_mlp": 0.01104635, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00058913, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.4762869571357586, + "language_loss": 0.75706756, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77928889, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 2.5973050594329834 + }, + { + "auxiliary_loss_clip": 0.01135684, + "auxiliary_loss_mlp": 0.01102149, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00048733, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 2.0434124825068034, + "language_loss": 0.75000668, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.772385, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 2.5493052005767822 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.0110299, + "balance_loss_clip": 1.00174177, + "balance_loss_mlp": 1.00046992, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.6546306182309736, + "language_loss": 0.79712522, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81948662, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 2.591531753540039 + }, + { + "auxiliary_loss_clip": 0.01130836, + "auxiliary_loss_mlp": 0.00747376, + "balance_loss_clip": 1.00184, + "balance_loss_mlp": 1.00035083, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.404568584791507, + "language_loss": 0.72127098, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74005312, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 2.6342215538024902 + }, + { + "auxiliary_loss_clip": 0.01148182, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.00046873, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.468768926492944, + "language_loss": 0.75595522, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77846026, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 2.5635781288146973 + }, + { + "auxiliary_loss_clip": 0.0116499, + "auxiliary_loss_mlp": 0.00747366, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.0003686, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.7283721459940526, + "language_loss": 0.72784728, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74697083, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 2.4872355461120605 + }, + { + "auxiliary_loss_clip": 0.01131508, + "auxiliary_loss_mlp": 0.01102318, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00056076, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 1.9986075527444946, + "language_loss": 0.71799707, + "learning_rate": 2.863314050734722e-08, + "loss": 0.74033535, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.54024600982666 + }, + { + "auxiliary_loss_clip": 0.01165128, + "auxiliary_loss_mlp": 0.01104355, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00049949, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 1.856208369787972, + "language_loss": 0.6698643, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69255912, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.4868829250335693 + }, + { + "auxiliary_loss_clip": 0.01164899, + "auxiliary_loss_mlp": 0.01102926, + "balance_loss_clip": 1.00173783, + "balance_loss_mlp": 1.00050187, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.7084833789720968, + "language_loss": 0.70348251, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.72616082, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.512202024459839 + }, + { + "auxiliary_loss_clip": 0.01149692, + "auxiliary_loss_mlp": 0.00747165, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00028121, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.5492541730185856, + "language_loss": 0.71165013, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73061872, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 2.575547695159912 + }, + { + "auxiliary_loss_clip": 0.01139577, + "auxiliary_loss_mlp": 0.01079357, + "balance_loss_clip": 1.00122225, + "balance_loss_mlp": 1.00001132, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8068378479179407, + "language_loss": 0.59020668, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.612396, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 2.896286964416504 + }, + { + "auxiliary_loss_clip": 0.01099795, + "auxiliary_loss_mlp": 0.01102063, + "balance_loss_clip": 1.00175965, + "balance_loss_mlp": 1.00059164, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.7485931696194843, + "language_loss": 0.74455386, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76657248, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.625128984451294 + }, + { + "auxiliary_loss_clip": 0.01118021, + "auxiliary_loss_mlp": 0.01103455, + "balance_loss_clip": 1.00178373, + "balance_loss_mlp": 1.00045812, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 1.9017894413714973, + "language_loss": 0.73312664, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75534141, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 2.6142475605010986 + }, + { + "auxiliary_loss_clip": 0.01095864, + "auxiliary_loss_mlp": 0.01079326, + "balance_loss_clip": 1.00092506, + "balance_loss_mlp": 0.99998009, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7371282996996527, + "language_loss": 0.55296135, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57471329, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.2020277976989746 + }, + { + "auxiliary_loss_clip": 0.01099766, + "auxiliary_loss_mlp": 0.01102263, + "balance_loss_clip": 1.00171232, + "balance_loss_mlp": 1.00041091, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.3342731271285468, + "language_loss": 0.77504718, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.7970674, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 2.7119500637054443 + }, + { + "auxiliary_loss_clip": 0.01131995, + "auxiliary_loss_mlp": 0.01102542, + "balance_loss_clip": 1.00186312, + "balance_loss_mlp": 1.00059402, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 2.8345756852289874, + "language_loss": 0.79745555, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.81980097, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 4.037825345993042 + }, + { + "auxiliary_loss_clip": 0.01118182, + "auxiliary_loss_mlp": 0.01101774, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00039816, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 1.9029853709716102, + "language_loss": 0.69663501, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71883458, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 2.606604814529419 + }, + { + "auxiliary_loss_clip": 0.01150535, + "auxiliary_loss_mlp": 0.01102464, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.00037372, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.5394674866110496, + "language_loss": 0.73823071, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.76076066, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.5396838188171387 + }, + { + "auxiliary_loss_clip": 0.01116851, + "auxiliary_loss_mlp": 0.01102675, + "balance_loss_clip": 1.00169086, + "balance_loss_mlp": 1.00044131, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 1.9899122037675618, + "language_loss": 0.63034248, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.65253782, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 2.60310435295105 + }, + { + "auxiliary_loss_clip": 0.01164938, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00050843, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 1.5944159908615319, + "language_loss": 0.59255701, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61523855, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.4903249740600586 + }, + { + "auxiliary_loss_clip": 0.01131528, + "auxiliary_loss_mlp": 0.01102443, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00039959, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.5347243041294358, + "language_loss": 0.62099391, + "learning_rate": 2.772114638584555e-08, + "loss": 0.64333361, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 2.7483322620391846 + }, + { + "auxiliary_loss_clip": 0.01133814, + "auxiliary_loss_mlp": 0.01103015, + "balance_loss_clip": 1.00164688, + "balance_loss_mlp": 1.00049472, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 2.337814611943649, + "language_loss": 0.73907351, + "learning_rate": 2.765656478622458e-08, + "loss": 0.76144177, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 2.585160732269287 + }, + { + "auxiliary_loss_clip": 0.01150218, + "auxiliary_loss_mlp": 0.01104965, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00053728, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 3.710033572343596, + "language_loss": 0.7196334, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74218524, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.573863983154297 + }, + { + "auxiliary_loss_clip": 0.01149285, + "auxiliary_loss_mlp": 0.0074727, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00035048, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.8193428336517274, + "language_loss": 0.69965887, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71862441, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.543184757232666 + }, + { + "auxiliary_loss_clip": 0.01165012, + "auxiliary_loss_mlp": 0.01102623, + "balance_loss_clip": 1.00190127, + "balance_loss_mlp": 1.00038862, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 1.9699557495103421, + "language_loss": 0.78438854, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80706489, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 2.477299928665161 + }, + { + "auxiliary_loss_clip": 0.01131932, + "auxiliary_loss_mlp": 0.00747346, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00034177, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.6782503396774142, + "language_loss": 0.6613028, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68009567, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 2.6118593215942383 + }, + { + "auxiliary_loss_clip": 0.01164811, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00044489, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.4606069397890065, + "language_loss": 0.79786569, + "learning_rate": 2.733477870890999e-08, + "loss": 0.82053775, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 3.925855875015259 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01079498, + "balance_loss_clip": 1.00120378, + "balance_loss_mlp": 1.00015187, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.7182097270915162, + "language_loss": 0.59805274, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.62029171, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.2175912857055664 + }, + { + "auxiliary_loss_clip": 0.01148203, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_clip": 1.00164604, + "balance_loss_mlp": 1.00054264, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.6205374133888875, + "language_loss": 0.74183333, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76434886, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.573110342025757 + }, + { + "auxiliary_loss_clip": 0.01104238, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.00049126, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.8519306988298978, + "language_loss": 0.69927001, + "learning_rate": 2.714260468695806e-08, + "loss": 0.72134829, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 4.036684274673462 + }, + { + "auxiliary_loss_clip": 0.01165015, + "auxiliary_loss_mlp": 0.01103204, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00049353, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.6064973851992288, + "language_loss": 0.76171738, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78439957, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.5156242847442627 + }, + { + "auxiliary_loss_clip": 0.01114575, + "auxiliary_loss_mlp": 0.01102075, + "balance_loss_clip": 1.00163174, + "balance_loss_mlp": 1.00041318, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 2.1167736048330164, + "language_loss": 0.79254901, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81471545, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 4.014052629470825 + }, + { + "auxiliary_loss_clip": 0.01149346, + "auxiliary_loss_mlp": 0.0110272, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00048661, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.4523260962330204, + "language_loss": 0.76629251, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78881317, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.5529935359954834 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01103514, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.00042212, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 1.6759963807170515, + "language_loss": 0.71613747, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73865473, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.5531511306762695 + }, + { + "auxiliary_loss_clip": 0.01114445, + "auxiliary_loss_mlp": 0.01103101, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.0004853, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 1.6663971819321655, + "language_loss": 0.72981298, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75198841, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.5842177867889404 + }, + { + "auxiliary_loss_clip": 0.01116953, + "auxiliary_loss_mlp": 0.01103623, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00043607, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.708930511464955, + "language_loss": 0.77574366, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79794943, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 2.617842435836792 + }, + { + "auxiliary_loss_clip": 0.01148342, + "auxiliary_loss_mlp": 0.01103304, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00049806, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 1.9210835728675486, + "language_loss": 0.73386866, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75638509, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 2.587702989578247 + }, + { + "auxiliary_loss_clip": 0.01150137, + "auxiliary_loss_mlp": 0.01102476, + "balance_loss_clip": 1.00182283, + "balance_loss_mlp": 1.00043273, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.8617626705480332, + "language_loss": 0.78215164, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80467772, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.5182945728302 + }, + { + "auxiliary_loss_clip": 0.01133553, + "auxiliary_loss_mlp": 0.01102403, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00045514, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.7976405956251278, + "language_loss": 0.77293772, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79529727, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 2.592406988143921 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.00747464, + "balance_loss_clip": 1.00170183, + "balance_loss_mlp": 1.00047433, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.8686341167021887, + "language_loss": 0.60965997, + "learning_rate": 2.650688769211107e-08, + "loss": 0.62829828, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 2.5925960540771484 + }, + { + "auxiliary_loss_clip": 0.011483, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00049567, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6017324429077482, + "language_loss": 0.79085743, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81336683, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.584951877593994 + }, + { + "auxiliary_loss_clip": 0.01148147, + "auxiliary_loss_mlp": 0.01103694, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00041068, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 2.1343629806487177, + "language_loss": 0.76033181, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.78285021, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 2.5316824913024902 + }, + { + "auxiliary_loss_clip": 0.01118735, + "auxiliary_loss_mlp": 0.0074745, + "balance_loss_clip": 1.00174606, + "balance_loss_mlp": 1.00042129, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 7.446903260648522, + "language_loss": 0.65820861, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67687047, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 2.6396801471710205 + }, + { + "auxiliary_loss_clip": 0.01148383, + "auxiliary_loss_mlp": 0.01102872, + "balance_loss_clip": 1.00189531, + "balance_loss_mlp": 1.00054252, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.9357999086673876, + "language_loss": 0.77143502, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79394752, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.545447826385498 + }, + { + "auxiliary_loss_clip": 0.01150264, + "auxiliary_loss_mlp": 0.01102582, + "balance_loss_clip": 1.00198495, + "balance_loss_mlp": 1.00034785, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 6.664295456572259, + "language_loss": 0.71098506, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73351353, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 2.545854330062866 + }, + { + "auxiliary_loss_clip": 0.01133421, + "auxiliary_loss_mlp": 0.01102166, + "balance_loss_clip": 1.00171292, + "balance_loss_mlp": 1.00031328, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.4982100717744344, + "language_loss": 0.71660823, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73896408, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 2.6028730869293213 + }, + { + "auxiliary_loss_clip": 0.01150161, + "auxiliary_loss_mlp": 0.01102433, + "balance_loss_clip": 1.00187933, + "balance_loss_mlp": 1.00058079, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.9684289034168476, + "language_loss": 0.80962503, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.832151, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.640685558319092 + }, + { + "auxiliary_loss_clip": 0.01165141, + "auxiliary_loss_mlp": 0.01103383, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.000386, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.5072412094287393, + "language_loss": 0.67704368, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69972891, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 2.552180528640747 + }, + { + "auxiliary_loss_clip": 0.01133518, + "auxiliary_loss_mlp": 0.01103243, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00043738, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 3.233784971452417, + "language_loss": 0.76143861, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78380626, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 2.5620362758636475 + }, + { + "auxiliary_loss_clip": 0.01150792, + "auxiliary_loss_mlp": 0.01103863, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00048447, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.640521178654097, + "language_loss": 0.73114651, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.7536931, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.5116870403289795 + }, + { + "auxiliary_loss_clip": 0.01130462, + "auxiliary_loss_mlp": 0.01103552, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.00055504, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 2.261619112360183, + "language_loss": 0.80498493, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82732511, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.591909170150757 + }, + { + "auxiliary_loss_clip": 0.01117136, + "auxiliary_loss_mlp": 0.01102377, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00042927, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 1.8686676159823292, + "language_loss": 0.82254827, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84474337, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 2.5846900939941406 + }, + { + "auxiliary_loss_clip": 0.01150452, + "auxiliary_loss_mlp": 0.01102084, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00051737, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.5696711047212644, + "language_loss": 0.71860188, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.74112725, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.580587148666382 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01103149, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.3953575349428622, + "language_loss": 0.69346887, + "learning_rate": 2.562945671948058e-08, + "loss": 0.7159943, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 3.9273622035980225 + }, + { + "auxiliary_loss_clip": 0.01135311, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_clip": 1.00168204, + "balance_loss_mlp": 1.00030363, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.9103042037127482, + "language_loss": 0.75191653, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77429503, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.5978879928588867 + }, + { + "auxiliary_loss_clip": 0.01114856, + "auxiliary_loss_mlp": 0.0110298, + "balance_loss_clip": 1.0017314, + "balance_loss_mlp": 1.00065124, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.4065842793946806, + "language_loss": 0.80135387, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82353222, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.6267895698547363 + }, + { + "auxiliary_loss_clip": 0.01135757, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00045753, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 1.9673094302208238, + "language_loss": 0.70831037, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.7306996, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 2.632024049758911 + }, + { + "auxiliary_loss_clip": 0.01116704, + "auxiliary_loss_mlp": 0.01103123, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00041199, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.6778971553689863, + "language_loss": 0.65906179, + "learning_rate": 2.538145713158446e-08, + "loss": 0.68126011, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.649319648742676 + }, + { + "auxiliary_loss_clip": 0.01148226, + "auxiliary_loss_mlp": 0.01102879, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00054979, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.6025977620822023, + "language_loss": 0.70497936, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72749043, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.5942914485931396 + }, + { + "auxiliary_loss_clip": 0.01148216, + "auxiliary_loss_mlp": 0.01101794, + "balance_loss_clip": 1.00187743, + "balance_loss_mlp": 1.00046647, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 2.437022722241697, + "language_loss": 0.63133514, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65383524, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.568201780319214 + }, + { + "auxiliary_loss_clip": 0.01131464, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_clip": 1.00175989, + "balance_loss_mlp": 1.00046396, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.7076612942675258, + "language_loss": 0.58766103, + "learning_rate": 2.519624364862061e-08, + "loss": 0.61000735, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 2.6518352031707764 + }, + { + "auxiliary_loss_clip": 0.01164921, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00062442, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.3326217290165596, + "language_loss": 0.73630375, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75898343, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.526221752166748 + }, + { + "auxiliary_loss_clip": 0.01131531, + "auxiliary_loss_mlp": 0.01103048, + "balance_loss_clip": 1.00175738, + "balance_loss_mlp": 1.00052834, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.880485310841017, + "language_loss": 0.60309708, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62544286, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.5853230953216553 + }, + { + "auxiliary_loss_clip": 0.01164993, + "auxiliary_loss_mlp": 0.011035, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00059843, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.8078373746147296, + "language_loss": 0.69577479, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71845973, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.4604835510253906 + }, + { + "auxiliary_loss_clip": 0.01101763, + "auxiliary_loss_mlp": 0.01102702, + "balance_loss_clip": 1.00171161, + "balance_loss_mlp": 1.00037265, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.685258321091396, + "language_loss": 0.73856413, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76060879, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 4.044937372207642 + }, + { + "auxiliary_loss_clip": 0.01133277, + "auxiliary_loss_mlp": 0.01103793, + "balance_loss_clip": 1.0019027, + "balance_loss_mlp": 1.00060582, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.7225834003402463, + "language_loss": 0.78651261, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80888331, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.5760533809661865 + }, + { + "auxiliary_loss_clip": 0.01117275, + "auxiliary_loss_mlp": 0.01103134, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00051916, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.3490455519359619, + "language_loss": 0.71298838, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73519242, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 4.153649568557739 + }, + { + "auxiliary_loss_clip": 0.01148264, + "auxiliary_loss_mlp": 0.01102234, + "balance_loss_clip": 1.00193059, + "balance_loss_mlp": 1.00047708, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.477992451129876, + "language_loss": 0.66253722, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68504226, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.5416276454925537 + }, + { + "auxiliary_loss_clip": 0.01150513, + "auxiliary_loss_mlp": 0.0110169, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00050497, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.5904722407713512, + "language_loss": 0.77279431, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79531634, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.5229156017303467 + }, + { + "auxiliary_loss_clip": 0.01165159, + "auxiliary_loss_mlp": 0.01103875, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00040114, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 2.3995972640931673, + "language_loss": 0.73360139, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75629175, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 3.9812052249908447 + }, + { + "auxiliary_loss_clip": 0.01144349, + "auxiliary_loss_mlp": 0.01078954, + "balance_loss_clip": 1.00144005, + "balance_loss_mlp": 0.99998999, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8491008019310972, + "language_loss": 0.53429663, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55652964, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 3.0563647747039795 + }, + { + "auxiliary_loss_clip": 0.01131664, + "auxiliary_loss_mlp": 0.01103054, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.0005343, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 2.122912413974423, + "language_loss": 0.72599822, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74834538, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 2.652407169342041 + }, + { + "auxiliary_loss_clip": 0.01131437, + "auxiliary_loss_mlp": 0.01102108, + "balance_loss_clip": 1.00179899, + "balance_loss_mlp": 1.00044632, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.7354477279932308, + "language_loss": 0.74302089, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76535636, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.5759756565093994 + }, + { + "auxiliary_loss_clip": 0.01112324, + "auxiliary_loss_mlp": 0.01101464, + "balance_loss_clip": 1.00192678, + "balance_loss_mlp": 1.00056481, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.5637769105124164, + "language_loss": 0.73137987, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75351775, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 2.643955945968628 + }, + { + "auxiliary_loss_clip": 0.01150184, + "auxiliary_loss_mlp": 0.01102858, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00043321, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.8679790009958868, + "language_loss": 0.61430848, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63683891, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.553083658218384 + }, + { + "auxiliary_loss_clip": 0.01148697, + "auxiliary_loss_mlp": 0.01103131, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00042069, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 2.902108289224959, + "language_loss": 0.72515637, + "learning_rate": 2.428028693179729e-08, + "loss": 0.7476747, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.525768280029297 + }, + { + "auxiliary_loss_clip": 0.01099589, + "auxiliary_loss_mlp": 0.01101079, + "balance_loss_clip": 1.00172532, + "balance_loss_mlp": 1.00037098, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.5767955692742206, + "language_loss": 0.65266263, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67466938, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 2.6239147186279297 + }, + { + "auxiliary_loss_clip": 0.01150014, + "auxiliary_loss_mlp": 0.01102827, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00068831, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.8023892560206094, + "language_loss": 0.77563488, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.79816329, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 2.5157437324523926 + }, + { + "auxiliary_loss_clip": 0.01116773, + "auxiliary_loss_mlp": 0.01102278, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00042534, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.126890390206916, + "language_loss": 0.74886596, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.77105653, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 2.5782480239868164 + }, + { + "auxiliary_loss_clip": 0.01148789, + "auxiliary_loss_mlp": 0.01104215, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00064611, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.8902195792421028, + "language_loss": 0.76233721, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78486729, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 2.550090789794922 + }, + { + "auxiliary_loss_clip": 0.01133719, + "auxiliary_loss_mlp": 0.01102747, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00041819, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 2.0400894263191334, + "language_loss": 0.66517818, + "learning_rate": 2.397871361623238e-08, + "loss": 0.6875428, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 2.541037082672119 + }, + { + "auxiliary_loss_clip": 0.01116774, + "auxiliary_loss_mlp": 0.01102236, + "balance_loss_clip": 1.00175488, + "balance_loss_mlp": 1.00038373, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.6454004619726976, + "language_loss": 0.70459282, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72678292, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 2.6487927436828613 + }, + { + "auxiliary_loss_clip": 0.01164832, + "auxiliary_loss_mlp": 0.0110269, + "balance_loss_clip": 1.00175762, + "balance_loss_mlp": 1.00045574, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 1.8011876863977172, + "language_loss": 0.73535454, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.7580297, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 2.506779909133911 + }, + { + "auxiliary_loss_clip": 0.0112091, + "auxiliary_loss_mlp": 0.0110374, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00045681, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.6544272520077097, + "language_loss": 0.7818054, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80405194, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 2.650538444519043 + }, + { + "auxiliary_loss_clip": 0.01118899, + "auxiliary_loss_mlp": 0.01101973, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00050259, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.4958059332533917, + "language_loss": 0.80558467, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82779342, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.618629217147827 + }, + { + "auxiliary_loss_clip": 0.01131472, + "auxiliary_loss_mlp": 0.011019, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00052452, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 2.9964100398627944, + "language_loss": 0.73069841, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75303209, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.57869553565979 + }, + { + "auxiliary_loss_clip": 0.01131378, + "auxiliary_loss_mlp": 0.01102044, + "balance_loss_clip": 1.00179636, + "balance_loss_mlp": 1.00038218, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.7338262472196597, + "language_loss": 0.78657347, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.80890769, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.5494370460510254 + }, + { + "auxiliary_loss_clip": 0.01131643, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00057101, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 1.7216577894718368, + "language_loss": 0.72379136, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74613774, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 2.5862228870391846 + }, + { + "auxiliary_loss_clip": 0.01129081, + "auxiliary_loss_mlp": 0.00747337, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00039375, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.489114254138001, + "language_loss": 0.78124052, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80000472, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.592135429382324 + }, + { + "auxiliary_loss_clip": 0.0111699, + "auxiliary_loss_mlp": 0.01103391, + "balance_loss_clip": 1.00165224, + "balance_loss_mlp": 1.00048923, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 2.212807534693592, + "language_loss": 0.70407414, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72627795, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.612626314163208 + }, + { + "auxiliary_loss_clip": 0.01117147, + "auxiliary_loss_mlp": 0.01103505, + "balance_loss_clip": 1.0018096, + "balance_loss_mlp": 1.00060368, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.4896035256392925, + "language_loss": 0.75514352, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77735007, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 2.6196670532226562 + }, + { + "auxiliary_loss_clip": 0.01117163, + "auxiliary_loss_mlp": 0.01101905, + "balance_loss_clip": 1.00162637, + "balance_loss_mlp": 1.00033927, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.6481643922203773, + "language_loss": 0.78125894, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80344969, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 4.004364013671875 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.01102925, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00069106, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.901243244124305, + "language_loss": 0.77952349, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80172282, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 2.597372531890869 + }, + { + "auxiliary_loss_clip": 0.01133621, + "auxiliary_loss_mlp": 0.01104445, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00059009, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.770339804497934, + "language_loss": 0.71988881, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74226952, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.591881275177002 + }, + { + "auxiliary_loss_clip": 0.01148407, + "auxiliary_loss_mlp": 0.01103866, + "balance_loss_clip": 1.00183451, + "balance_loss_mlp": 1.00058293, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.5538465587343344, + "language_loss": 0.75224906, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77477175, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 2.545341968536377 + }, + { + "auxiliary_loss_clip": 0.01134095, + "auxiliary_loss_mlp": 0.01103244, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00053358, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.0972742831173687, + "language_loss": 0.72495627, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74732971, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.5828592777252197 + }, + { + "auxiliary_loss_clip": 0.01133774, + "auxiliary_loss_mlp": 0.01102865, + "balance_loss_clip": 1.00172281, + "balance_loss_mlp": 1.00034475, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 1.856412113493254, + "language_loss": 0.79455781, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81692421, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.6102242469787598 + }, + { + "auxiliary_loss_clip": 0.01150085, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.00046766, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.5578850419091401, + "language_loss": 0.59642899, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61896068, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.760770797729492 + }, + { + "auxiliary_loss_clip": 0.01134352, + "auxiliary_loss_mlp": 0.01101299, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00049543, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.7121542832555436, + "language_loss": 0.72696328, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74931985, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.01131858, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_clip": 1.0016278, + "balance_loss_mlp": 1.00046706, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.4412589945872805, + "language_loss": 0.67745739, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69979912, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 2.6424527168273926 + }, + { + "auxiliary_loss_clip": 0.01165044, + "auxiliary_loss_mlp": 0.01102206, + "balance_loss_clip": 1.00192726, + "balance_loss_mlp": 1.0006398, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.685493807038339, + "language_loss": 0.76460546, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78727794, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.537038564682007 + }, + { + "auxiliary_loss_clip": 0.01135391, + "auxiliary_loss_mlp": 0.01102773, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00044358, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.7734373220775552, + "language_loss": 0.77878273, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80116439, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 2.5979115962982178 + }, + { + "auxiliary_loss_clip": 0.01145176, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.00120711, + "balance_loss_mlp": 0.99998963, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7069220993968783, + "language_loss": 0.62582982, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64807487, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 4.512548208236694 + }, + { + "auxiliary_loss_clip": 0.0109988, + "auxiliary_loss_mlp": 0.01101864, + "balance_loss_clip": 1.00172257, + "balance_loss_mlp": 1.00048876, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.3801624709516354, + "language_loss": 0.5658946, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58791202, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 2.646505117416382 + }, + { + "auxiliary_loss_clip": 0.01164761, + "auxiliary_loss_mlp": 0.01102007, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00044084, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.357956412167689, + "language_loss": 0.81764925, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84031689, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 3.8799009323120117 + }, + { + "auxiliary_loss_clip": 0.01098131, + "auxiliary_loss_mlp": 0.00747353, + "balance_loss_clip": 1.00141358, + "balance_loss_mlp": 1.00044346, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.5803104413307252, + "language_loss": 0.66757607, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68603092, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.655893325805664 + }, + { + "auxiliary_loss_clip": 0.01149847, + "auxiliary_loss_mlp": 0.01103697, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00060463, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.5962606927109617, + "language_loss": 0.6533919, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67592734, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 4.058238506317139 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01101628, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00044346, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.598601760017505, + "language_loss": 0.6777488, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69995058, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.6122448444366455 + }, + { + "auxiliary_loss_clip": 0.01134922, + "auxiliary_loss_mlp": 0.01102314, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.0004617, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 6.524244148546084, + "language_loss": 0.78561121, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80798358, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.703503131866455 + }, + { + "auxiliary_loss_clip": 0.01116419, + "auxiliary_loss_mlp": 0.01101773, + "balance_loss_clip": 1.00173151, + "balance_loss_mlp": 1.00049269, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 2.659400074019007, + "language_loss": 0.59103817, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61322016, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.6221742630004883 + }, + { + "auxiliary_loss_clip": 0.01116783, + "auxiliary_loss_mlp": 0.0110268, + "balance_loss_clip": 1.0017221, + "balance_loss_mlp": 1.00044596, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 2.887251188227872, + "language_loss": 0.69947201, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72166669, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 2.6088106632232666 + }, + { + "auxiliary_loss_clip": 0.01134799, + "auxiliary_loss_mlp": 0.01102865, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00044048, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.232775960818554, + "language_loss": 0.85167301, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87404966, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 2.605849266052246 + }, + { + "auxiliary_loss_clip": 0.01143292, + "auxiliary_loss_mlp": 0.01078934, + "balance_loss_clip": 1.00121069, + "balance_loss_mlp": 0.9999696, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7701975704707326, + "language_loss": 0.61855197, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.64077425, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.161038637161255 + }, + { + "auxiliary_loss_clip": 0.01114913, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_clip": 1.0015415, + "balance_loss_mlp": 1.00039208, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.8895301059900385, + "language_loss": 0.59965694, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62183142, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 2.6081652641296387 + }, + { + "auxiliary_loss_clip": 0.01120765, + "auxiliary_loss_mlp": 0.00747479, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00047684, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 2.5238073790962194, + "language_loss": 0.71152377, + "learning_rate": 2.197770872795579e-08, + "loss": 0.73020625, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.652294158935547 + }, + { + "auxiliary_loss_clip": 0.01114981, + "auxiliary_loss_mlp": 0.01101738, + "balance_loss_clip": 1.00153089, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 1.8323893120737849, + "language_loss": 0.76605046, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78821766, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 2.6317358016967773 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01103765, + "balance_loss_clip": 1.00188911, + "balance_loss_mlp": 1.00048232, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 1.6927749826576535, + "language_loss": 0.57788569, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60042536, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.598566770553589 + }, + { + "auxiliary_loss_clip": 0.01133572, + "auxiliary_loss_mlp": 0.01103311, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.00040948, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.416419764642518, + "language_loss": 0.74788517, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.77025402, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 2.600301504135132 + }, + { + "auxiliary_loss_clip": 0.01165106, + "auxiliary_loss_mlp": 0.01103401, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.00049973, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.838402055066229, + "language_loss": 0.62373775, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64642286, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 2.5236921310424805 + }, + { + "auxiliary_loss_clip": 0.01132985, + "auxiliary_loss_mlp": 0.01101896, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.00051999, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.9000246033979198, + "language_loss": 0.89437246, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91672134, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 2.5884768962860107 + }, + { + "auxiliary_loss_clip": 0.0116511, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00053644, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.7465660437839676, + "language_loss": 0.67704546, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69973195, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 2.567068099975586 + }, + { + "auxiliary_loss_clip": 0.01148201, + "auxiliary_loss_mlp": 0.01104295, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00053501, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.7838894175587834, + "language_loss": 0.69062644, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.7131514, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 2.576768398284912 + }, + { + "auxiliary_loss_clip": 0.01117398, + "auxiliary_loss_mlp": 0.01103147, + "balance_loss_clip": 1.00165439, + "balance_loss_mlp": 1.00053215, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.7715401134588327, + "language_loss": 0.71195936, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.73416483, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 2.63903546333313 + }, + { + "auxiliary_loss_clip": 0.01164891, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00185764, + "balance_loss_mlp": 1.00034869, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.2779117121079584, + "language_loss": 0.68325615, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70592993, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.5405280590057373 + }, + { + "auxiliary_loss_clip": 0.01117175, + "auxiliary_loss_mlp": 0.00747257, + "balance_loss_clip": 1.00172353, + "balance_loss_mlp": 1.00032711, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 1.7496009702164157, + "language_loss": 0.85372716, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.87237144, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.6806201934814453 + }, + { + "auxiliary_loss_clip": 0.01087748, + "auxiliary_loss_mlp": 0.01103187, + "balance_loss_clip": 1.00165951, + "balance_loss_mlp": 1.00047588, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.7659197254125498, + "language_loss": 0.72104776, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74295712, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 2.801452875137329 + }, + { + "auxiliary_loss_clip": 0.01148535, + "auxiliary_loss_mlp": 0.01102276, + "balance_loss_clip": 1.00185835, + "balance_loss_mlp": 1.00051939, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.719828481360021, + "language_loss": 0.71386957, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.7363776, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.5093698501586914 + }, + { + "auxiliary_loss_clip": 0.01133112, + "auxiliary_loss_mlp": 0.01103011, + "balance_loss_clip": 1.00175369, + "balance_loss_mlp": 1.00058615, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.5751849781407063, + "language_loss": 0.66006529, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68242651, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 2.9153780937194824 + }, + { + "auxiliary_loss_clip": 0.01148268, + "auxiliary_loss_mlp": 0.01103225, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00051475, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.089627277914454, + "language_loss": 0.78095925, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80347425, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.510460376739502 + }, + { + "auxiliary_loss_clip": 0.01164983, + "auxiliary_loss_mlp": 0.01103771, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00039268, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.6282178882083655, + "language_loss": 0.7790072, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.80169475, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 3.958468437194824 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00047731, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.82508430835593, + "language_loss": 0.69989043, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72256684, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.5134429931640625 + }, + { + "auxiliary_loss_clip": 0.01116712, + "auxiliary_loss_mlp": 0.01104398, + "balance_loss_clip": 1.00165772, + "balance_loss_mlp": 1.00044799, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.8003367236158776, + "language_loss": 0.72728407, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74949515, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.6104886531829834 + }, + { + "auxiliary_loss_clip": 0.01135506, + "auxiliary_loss_mlp": 0.01102195, + "balance_loss_clip": 1.00180078, + "balance_loss_mlp": 1.00034308, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 3.2402691788564324, + "language_loss": 0.5666908, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.58906782, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 2.589207649230957 + }, + { + "auxiliary_loss_clip": 0.01143751, + "auxiliary_loss_mlp": 0.01079322, + "balance_loss_clip": 1.00107527, + "balance_loss_mlp": 0.99997663, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7112160051835846, + "language_loss": 0.57865441, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.60088515, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.1842732429504395 + }, + { + "auxiliary_loss_clip": 0.01165014, + "auxiliary_loss_mlp": 0.01103144, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00043321, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.2831250817991133, + "language_loss": 0.66859031, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69127184, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.5538063049316406 + }, + { + "auxiliary_loss_clip": 0.01165054, + "auxiliary_loss_mlp": 0.01102458, + "balance_loss_clip": 1.00199592, + "balance_loss_mlp": 1.00060582, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.5285160605048567, + "language_loss": 0.74074471, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76341981, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.54406476020813 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_clip": 1.00175357, + "balance_loss_mlp": 1.00048554, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.755614727085303, + "language_loss": 0.7799648, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80214524, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.6800501346588135 + }, + { + "auxiliary_loss_clip": 0.01164913, + "auxiliary_loss_mlp": 0.0110246, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00041652, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.3752498745786168, + "language_loss": 0.69995993, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.72263366, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.5226173400878906 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.00747298, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00039899, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 1.7326072054216815, + "language_loss": 0.66083401, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67963409, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.5630548000335693 + }, + { + "auxiliary_loss_clip": 0.01148231, + "auxiliary_loss_mlp": 0.01102767, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00043809, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.7234973052691798, + "language_loss": 0.81675386, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83926386, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 4.025444030761719 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01103429, + "balance_loss_clip": 1.00164413, + "balance_loss_mlp": 1.00043249, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 1.7864997488515248, + "language_loss": 0.72640932, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74876165, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 2.637460231781006 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01102741, + "balance_loss_clip": 1.00188649, + "balance_loss_mlp": 1.00050688, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.827293277035593, + "language_loss": 0.79649341, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81916976, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 3.881033420562744 + }, + { + "auxiliary_loss_clip": 0.01133488, + "auxiliary_loss_mlp": 0.01103294, + "balance_loss_clip": 1.00172186, + "balance_loss_mlp": 1.00029755, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.5688500291535294, + "language_loss": 0.72676551, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74913335, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 2.613093137741089 + }, + { + "auxiliary_loss_clip": 0.01112527, + "auxiliary_loss_mlp": 0.01079383, + "balance_loss_clip": 1.00107336, + "balance_loss_mlp": 1.00003707, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.9070328244854499, + "language_loss": 0.52356267, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54548174, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.144056797027588 + }, + { + "auxiliary_loss_clip": 0.01148614, + "auxiliary_loss_mlp": 0.01104593, + "balance_loss_clip": 1.00199115, + "balance_loss_mlp": 1.00045133, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.0333702474383304, + "language_loss": 0.68610346, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70863557, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 3.990004539489746 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.00747388, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.0004077, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.189998155303543, + "language_loss": 0.83201301, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85098404, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.533417224884033 + }, + { + "auxiliary_loss_clip": 0.01115824, + "auxiliary_loss_mlp": 0.01078919, + "balance_loss_clip": 1.00156498, + "balance_loss_mlp": 0.99995434, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7080692512264611, + "language_loss": 0.54319179, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56513923, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 3.2693798542022705 + }, + { + "auxiliary_loss_clip": 0.01134514, + "auxiliary_loss_mlp": 0.0110154, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.0005455, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.5970335827681974, + "language_loss": 0.8522234, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87458396, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.5733258724212646 + }, + { + "auxiliary_loss_clip": 0.01133285, + "auxiliary_loss_mlp": 0.00747305, + "balance_loss_clip": 1.00174487, + "balance_loss_mlp": 1.00037479, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 1.9156310463172248, + "language_loss": 0.80282402, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82162994, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 2.5950753688812256 + }, + { + "auxiliary_loss_clip": 0.01150575, + "auxiliary_loss_mlp": 0.01103482, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00048542, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.1490944966418284, + "language_loss": 0.59705603, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.6195966, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.5640709400177 + }, + { + "auxiliary_loss_clip": 0.01148134, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00052667, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 2.263469678654289, + "language_loss": 0.70929998, + "learning_rate": 1.995350770979254e-08, + "loss": 0.73180896, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.5478553771972656 + }, + { + "auxiliary_loss_clip": 0.01099552, + "auxiliary_loss_mlp": 0.01103457, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00036478, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 2.751834872990926, + "language_loss": 0.70916742, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73119748, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 2.6483845710754395 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01101948, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00057209, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.7509204197908648, + "language_loss": 0.69983888, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72199845, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 2.660127639770508 + }, + { + "auxiliary_loss_clip": 0.01130994, + "auxiliary_loss_mlp": 0.00747315, + "balance_loss_clip": 1.0017463, + "balance_loss_mlp": 1.00037146, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.7648761103583013, + "language_loss": 0.82758236, + "learning_rate": 1.978921532427802e-08, + "loss": 0.84636545, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.5931174755096436 + }, + { + "auxiliary_loss_clip": 0.01148069, + "auxiliary_loss_mlp": 0.01102354, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.0004065, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 2.0734049842616114, + "language_loss": 0.67141944, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69392365, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 2.631439208984375 + }, + { + "auxiliary_loss_clip": 0.01148484, + "auxiliary_loss_mlp": 0.01103521, + "balance_loss_clip": 1.00193465, + "balance_loss_mlp": 1.00061917, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.844795701672466, + "language_loss": 0.74360251, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76612258, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.5406363010406494 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.01102945, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.0004251, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 1.7059875279980758, + "language_loss": 0.69205141, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71456605, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 2.5387468338012695 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01103868, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00058484, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 2.253578069220733, + "language_loss": 0.71740347, + "learning_rate": 1.95712100769696e-08, + "loss": 0.73979926, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 2.560089349746704 + }, + { + "auxiliary_loss_clip": 0.01067028, + "auxiliary_loss_mlp": 0.0110293, + "balance_loss_clip": 1.00153077, + "balance_loss_mlp": 1.00050521, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 2.9482377793795336, + "language_loss": 0.73495603, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75665557, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 2.739591598510742 + }, + { + "auxiliary_loss_clip": 0.01164982, + "auxiliary_loss_mlp": 0.01102705, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00047123, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.417411664505173, + "language_loss": 0.67125154, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69392842, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 2.4782702922821045 + }, + { + "auxiliary_loss_clip": 0.01148058, + "auxiliary_loss_mlp": 0.01102857, + "balance_loss_clip": 1.00182998, + "balance_loss_mlp": 1.00043249, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.6394825804589883, + "language_loss": 0.6413033, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.6638124, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.568204879760742 + }, + { + "auxiliary_loss_clip": 0.01164794, + "auxiliary_loss_mlp": 0.01101719, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00043929, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 1.761461519537694, + "language_loss": 0.80705249, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82971764, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 2.5164875984191895 + }, + { + "auxiliary_loss_clip": 0.01133724, + "auxiliary_loss_mlp": 0.0110212, + "balance_loss_clip": 1.00193942, + "balance_loss_mlp": 1.00060141, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.6661556430708073, + "language_loss": 0.72737128, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.74972969, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 2.549546480178833 + }, + { + "auxiliary_loss_clip": 0.01114891, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_clip": 1.00111246, + "balance_loss_mlp": 0.99998134, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6309296125394644, + "language_loss": 0.53123617, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55317843, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.302591323852539 + }, + { + "auxiliary_loss_clip": 0.01150461, + "auxiliary_loss_mlp": 0.01103444, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00044751, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 3.341079914848594, + "language_loss": 0.75511634, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77765542, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.53049373626709 + }, + { + "auxiliary_loss_clip": 0.0111516, + "auxiliary_loss_mlp": 0.01103089, + "balance_loss_clip": 1.0017035, + "balance_loss_mlp": 1.00047326, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.7089912652269454, + "language_loss": 0.78906244, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81124496, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.6040875911712646 + }, + { + "auxiliary_loss_clip": 0.01149895, + "auxiliary_loss_mlp": 0.01103668, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00038564, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 1.7243651404209417, + "language_loss": 0.50888002, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.5314157, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.6510729789733887 + }, + { + "auxiliary_loss_clip": 0.01102243, + "auxiliary_loss_mlp": 0.01103312, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00050628, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.091463934931413, + "language_loss": 0.84171569, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86377132, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 4.027064323425293 + }, + { + "auxiliary_loss_clip": 0.01133415, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_clip": 1.00168538, + "balance_loss_mlp": 1.00058281, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.5478949781584668, + "language_loss": 0.75202894, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77438653, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 2.6403796672821045 + }, + { + "auxiliary_loss_clip": 0.01131902, + "auxiliary_loss_mlp": 0.01103018, + "balance_loss_clip": 1.00169635, + "balance_loss_mlp": 1.00049806, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 1.9155617841585217, + "language_loss": 0.86139894, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88374817, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.5967299938201904 + }, + { + "auxiliary_loss_clip": 0.01118147, + "auxiliary_loss_mlp": 0.01103681, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00058866, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.8873029170320341, + "language_loss": 0.75646412, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77868247, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 2.63458251953125 + }, + { + "auxiliary_loss_clip": 0.01131714, + "auxiliary_loss_mlp": 0.01101824, + "balance_loss_clip": 1.00160801, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.833448221414648, + "language_loss": 0.77464676, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79698217, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.5827667713165283 + }, + { + "auxiliary_loss_clip": 0.01103053, + "auxiliary_loss_mlp": 0.01103728, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00034928, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 1.768514729505089, + "language_loss": 0.68471539, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70678318, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.7537500858306885 + }, + { + "auxiliary_loss_clip": 0.01132934, + "auxiliary_loss_mlp": 0.01103124, + "balance_loss_clip": 1.00190008, + "balance_loss_mlp": 1.00041366, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.6273033834961559, + "language_loss": 0.81997025, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84233087, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.5746424198150635 + }, + { + "auxiliary_loss_clip": 0.01118768, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00051975, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.4812104991480337, + "language_loss": 0.72171414, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74393511, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.664891481399536 + }, + { + "auxiliary_loss_clip": 0.01083045, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_clip": 1.00160432, + "balance_loss_mlp": 1.00037742, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.6489640347878227, + "language_loss": 0.62199354, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64384913, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.6653847694396973 + }, + { + "auxiliary_loss_clip": 0.01164866, + "auxiliary_loss_mlp": 0.01102176, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00051498, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.8056673482625776, + "language_loss": 0.68741363, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71008408, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 2.504606246948242 + }, + { + "auxiliary_loss_clip": 0.01116527, + "auxiliary_loss_mlp": 0.01103442, + "balance_loss_clip": 1.0016346, + "balance_loss_mlp": 1.00073147, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 1.9740931892986737, + "language_loss": 0.75144458, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77364421, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 4.024260759353638 + }, + { + "auxiliary_loss_clip": 0.01099836, + "auxiliary_loss_mlp": 0.01079336, + "balance_loss_clip": 1.00123453, + "balance_loss_mlp": 0.99999028, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7024182239724613, + "language_loss": 0.57321501, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.5950067, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 4.748335838317871 + }, + { + "auxiliary_loss_clip": 0.01160386, + "auxiliary_loss_mlp": 0.0074645, + "balance_loss_clip": 1.00116503, + "balance_loss_mlp": 1.00110173, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9147856205709377, + "language_loss": 0.65945339, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67852175, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 3.0284626483917236 + }, + { + "auxiliary_loss_clip": 0.01129323, + "auxiliary_loss_mlp": 0.0108022, + "balance_loss_clip": 1.0017364, + "balance_loss_mlp": 1.00011182, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.779685884831015, + "language_loss": 0.57064271, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59273815, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.119684934616089 + }, + { + "auxiliary_loss_clip": 0.0108383, + "auxiliary_loss_mlp": 0.01103158, + "balance_loss_clip": 1.00167775, + "balance_loss_mlp": 1.00054288, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.6515813189709578, + "language_loss": 0.78322983, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80509973, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 4.109342098236084 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.01103274, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00037265, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 3.4319913733041427, + "language_loss": 0.68247175, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70500779, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.564751625061035 + }, + { + "auxiliary_loss_clip": 0.0111595, + "auxiliary_loss_mlp": 0.0110248, + "balance_loss_clip": 1.00165033, + "balance_loss_mlp": 1.00043643, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 2.1939855900261978, + "language_loss": 0.65201795, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.6742022, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.6403467655181885 + }, + { + "auxiliary_loss_clip": 0.01133713, + "auxiliary_loss_mlp": 0.01102233, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00047565, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.4941340290698377, + "language_loss": 0.73705471, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.75941414, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.6292362213134766 + }, + { + "auxiliary_loss_clip": 0.01165121, + "auxiliary_loss_mlp": 0.01103632, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00044489, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 1.5939239503963707, + "language_loss": 0.72957313, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75226074, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.4915096759796143 + }, + { + "auxiliary_loss_clip": 0.01131719, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_clip": 1.00170577, + "balance_loss_mlp": 1.00050294, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.5186710078725205, + "language_loss": 0.71162415, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73395908, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 2.63968563079834 + }, + { + "auxiliary_loss_clip": 0.01165032, + "auxiliary_loss_mlp": 0.01104016, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.0005424, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.7270297453889194, + "language_loss": 0.72072387, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74341434, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 2.5968220233917236 + }, + { + "auxiliary_loss_clip": 0.01148409, + "auxiliary_loss_mlp": 0.01103874, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.0005908, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.7290668601836021, + "language_loss": 0.68417525, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70669806, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 2.5481765270233154 + }, + { + "auxiliary_loss_clip": 0.01164839, + "auxiliary_loss_mlp": 0.01102971, + "balance_loss_clip": 1.00167847, + "balance_loss_mlp": 1.00064158, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 2.823164508526495, + "language_loss": 0.66211611, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.68479425, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.4687492847442627 + }, + { + "auxiliary_loss_clip": 0.01060505, + "auxiliary_loss_mlp": 0.01079794, + "balance_loss_clip": 1.00122499, + "balance_loss_mlp": 1.000067, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7525583149965996, + "language_loss": 0.61869383, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.64009678, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 3.3811867237091064 + }, + { + "auxiliary_loss_clip": 0.0116478, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.0005486, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.750306925844916, + "language_loss": 0.74980539, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77247721, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 2.5456972122192383 + }, + { + "auxiliary_loss_clip": 0.01133468, + "auxiliary_loss_mlp": 0.01102747, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00041807, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.2218173106769723, + "language_loss": 0.6984762, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72083837, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.57380747795105 + }, + { + "auxiliary_loss_clip": 0.01095992, + "auxiliary_loss_mlp": 0.01102778, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.00044918, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 2.1244333375056916, + "language_loss": 0.78506887, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80705655, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.664882183074951 + }, + { + "auxiliary_loss_clip": 0.01165086, + "auxiliary_loss_mlp": 0.0110394, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.7101046159146056, + "language_loss": 0.68456018, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70725042, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 2.552304983139038 + }, + { + "auxiliary_loss_clip": 0.01133595, + "auxiliary_loss_mlp": 0.01102161, + "balance_loss_clip": 1.00173461, + "balance_loss_mlp": 1.00040424, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.647541328919536, + "language_loss": 0.8623935, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88475108, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.600473642349243 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.0005734, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.054597112556841, + "language_loss": 0.79885584, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.82123411, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 2.66523814201355 + }, + { + "auxiliary_loss_clip": 0.01148188, + "auxiliary_loss_mlp": 0.01103342, + "balance_loss_clip": 1.0018698, + "balance_loss_mlp": 1.00044012, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.72997766265317, + "language_loss": 0.69425738, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71677268, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 2.6075057983398438 + }, + { + "auxiliary_loss_clip": 0.01083031, + "auxiliary_loss_mlp": 0.0110265, + "balance_loss_clip": 1.00148988, + "balance_loss_mlp": 1.00060666, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 2.5219075063037417, + "language_loss": 0.57951999, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60137677, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 2.727457046508789 + }, + { + "auxiliary_loss_clip": 0.011483, + "auxiliary_loss_mlp": 0.01103401, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.00049984, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.8875687537630794, + "language_loss": 0.73795438, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.7604714, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.6338610649108887 + }, + { + "auxiliary_loss_clip": 0.01131555, + "auxiliary_loss_mlp": 0.01102622, + "balance_loss_clip": 1.00173211, + "balance_loss_mlp": 1.00038862, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.71554111975982, + "language_loss": 0.62813663, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.65047842, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.5598931312561035 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01102884, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00045991, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.7275151694109323, + "language_loss": 0.60082996, + "learning_rate": 1.725248447997507e-08, + "loss": 0.62301016, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.01115763, + "auxiliary_loss_mlp": 0.01103526, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00043416, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.986086429682185, + "language_loss": 0.74307644, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.7652694, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.675290584564209 + }, + { + "auxiliary_loss_clip": 0.01132914, + "auxiliary_loss_mlp": 0.00747254, + "balance_loss_clip": 1.00164461, + "balance_loss_mlp": 1.0003159, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.500941687182266, + "language_loss": 0.74738634, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76618803, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.5863096714019775 + }, + { + "auxiliary_loss_clip": 0.01150632, + "auxiliary_loss_mlp": 0.01102686, + "balance_loss_clip": 1.00196946, + "balance_loss_mlp": 1.00045168, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 4.9901998743155636, + "language_loss": 0.65398377, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67651695, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.54286789894104 + }, + { + "auxiliary_loss_clip": 0.01164934, + "auxiliary_loss_mlp": 0.01102097, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.516743900540595, + "language_loss": 0.77733207, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80000234, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 3.915452480316162 + }, + { + "auxiliary_loss_clip": 0.01098953, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_clip": 1.00173151, + "balance_loss_mlp": 1.00043297, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 1.9604019068472094, + "language_loss": 0.75650293, + "learning_rate": 1.699820008484698e-08, + "loss": 0.77851629, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 2.6089093685150146 + }, + { + "auxiliary_loss_clip": 0.01132734, + "auxiliary_loss_mlp": 0.01103682, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00049412, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 1.8583800720047103, + "language_loss": 0.71439886, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.736763, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 2.6002962589263916 + }, + { + "auxiliary_loss_clip": 0.0113156, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_clip": 1.00176132, + "balance_loss_mlp": 1.00042069, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.5760233881003225, + "language_loss": 0.73959279, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76192635, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 2.58948016166687 + }, + { + "auxiliary_loss_clip": 0.01108916, + "auxiliary_loss_mlp": 0.01079005, + "balance_loss_clip": 1.00116515, + "balance_loss_mlp": 1.00004101, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8808171443775697, + "language_loss": 0.57601494, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59789413, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.1645328998565674 + }, + { + "auxiliary_loss_clip": 0.01164876, + "auxiliary_loss_mlp": 0.01102827, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00049758, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.667953134066936, + "language_loss": 0.78602302, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.80870008, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.521116018295288 + }, + { + "auxiliary_loss_clip": 0.01133692, + "auxiliary_loss_mlp": 0.01102577, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00043797, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.601552225052053, + "language_loss": 0.79542267, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81778538, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.592268466949463 + }, + { + "auxiliary_loss_clip": 0.01104671, + "auxiliary_loss_mlp": 0.01104005, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00043571, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 5.097050938088911, + "language_loss": 0.80561095, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82769775, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.6246144771575928 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01104094, + "balance_loss_clip": 1.00167465, + "balance_loss_mlp": 1.00066853, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.2444271498712456, + "language_loss": 0.67451847, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.6965633, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.7324671745300293 + }, + { + "auxiliary_loss_clip": 0.0115051, + "auxiliary_loss_mlp": 0.0110247, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00061738, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 2.803158893067498, + "language_loss": 0.79396522, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81649506, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.5342814922332764 + }, + { + "auxiliary_loss_clip": 0.01147763, + "auxiliary_loss_mlp": 0.01102517, + "balance_loss_clip": 1.0019151, + "balance_loss_mlp": 1.00056911, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.6148104510640318, + "language_loss": 0.77409863, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79660141, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 4.498270034790039 + }, + { + "auxiliary_loss_clip": 0.01133557, + "auxiliary_loss_mlp": 0.01103297, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.0003953, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 2.40012225572501, + "language_loss": 0.66981095, + "learning_rate": 1.64952712054669e-08, + "loss": 0.6921795, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 3.9686365127563477 + }, + { + "auxiliary_loss_clip": 0.01147737, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00039506, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.0671250077916126, + "language_loss": 0.76509416, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78404546, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.504577398300171 + }, + { + "auxiliary_loss_clip": 0.01103468, + "auxiliary_loss_mlp": 0.01103243, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00053263, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.5762212754142102, + "language_loss": 0.6928941, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71496129, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 2.6725635528564453 + }, + { + "auxiliary_loss_clip": 0.01148586, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_clip": 1.00170994, + "balance_loss_mlp": 1.00046694, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.7023538090086936, + "language_loss": 0.67851496, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70102876, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.564563274383545 + }, + { + "auxiliary_loss_clip": 0.01164763, + "auxiliary_loss_mlp": 0.01102217, + "balance_loss_clip": 1.00182521, + "balance_loss_mlp": 1.0003649, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 3.065656480240595, + "language_loss": 0.55995452, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.58262432, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 3.931832790374756 + }, + { + "auxiliary_loss_clip": 0.01133265, + "auxiliary_loss_mlp": 0.01101623, + "balance_loss_clip": 1.00166786, + "balance_loss_mlp": 1.00043869, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 1.810408747037025, + "language_loss": 0.68479204, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70714092, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 2.6284337043762207 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.01102372, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00061488, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 1.9190384751260088, + "language_loss": 0.81781507, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.8403362, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.496661424636841 + }, + { + "auxiliary_loss_clip": 0.01149913, + "auxiliary_loss_mlp": 0.01103722, + "balance_loss_clip": 1.00187457, + "balance_loss_mlp": 1.00043869, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.471069268440769, + "language_loss": 0.83195078, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85448712, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 2.501250743865967 + }, + { + "auxiliary_loss_clip": 0.01165124, + "auxiliary_loss_mlp": 0.01103502, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00050497, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 2.2671097413965486, + "language_loss": 0.80278188, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82546818, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 2.496448516845703 + }, + { + "auxiliary_loss_clip": 0.0115025, + "auxiliary_loss_mlp": 0.01103383, + "balance_loss_clip": 1.00176239, + "balance_loss_mlp": 1.00048137, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 2.1488742977064446, + "language_loss": 0.68352169, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70605803, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.5517218112945557 + }, + { + "auxiliary_loss_clip": 0.01148064, + "auxiliary_loss_mlp": 0.00747402, + "balance_loss_clip": 1.00173903, + "balance_loss_mlp": 1.00041676, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.38378896126769, + "language_loss": 0.69262612, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71158081, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 2.5815634727478027 + }, + { + "auxiliary_loss_clip": 0.01110316, + "auxiliary_loss_mlp": 0.0107891, + "balance_loss_clip": 1.00117898, + "balance_loss_mlp": 0.99994546, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6663487668523337, + "language_loss": 0.53247881, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55437106, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 3.2974205017089844 + }, + { + "auxiliary_loss_clip": 0.01164947, + "auxiliary_loss_mlp": 0.01102974, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00045443, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 1.9558290694169602, + "language_loss": 0.67866182, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70134103, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.504575490951538 + }, + { + "auxiliary_loss_clip": 0.01115669, + "auxiliary_loss_mlp": 0.01101558, + "balance_loss_clip": 1.00158727, + "balance_loss_mlp": 1.00056386, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.5807724016505331, + "language_loss": 0.67436826, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69654053, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.593005418777466 + }, + { + "auxiliary_loss_clip": 0.01164968, + "auxiliary_loss_mlp": 0.01103178, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00056314, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.8780886664472964, + "language_loss": 0.7906692, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81335068, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 2.503319501876831 + }, + { + "auxiliary_loss_clip": 0.01096362, + "auxiliary_loss_mlp": 0.01103384, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00048232, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 3.917064344909279, + "language_loss": 0.63762593, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.65962338, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.6354238986968994 + }, + { + "auxiliary_loss_clip": 0.01164809, + "auxiliary_loss_mlp": 0.01102161, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00049949, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.6484566980050617, + "language_loss": 0.67096621, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.69363588, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.546018123626709 + }, + { + "auxiliary_loss_clip": 0.01148251, + "auxiliary_loss_mlp": 0.01101937, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00065637, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 1.6600233127969553, + "language_loss": 0.74445379, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76695567, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 2.5456809997558594 + }, + { + "auxiliary_loss_clip": 0.01129799, + "auxiliary_loss_mlp": 0.01079757, + "balance_loss_clip": 1.00118339, + "balance_loss_mlp": 1.0000298, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8831956772150997, + "language_loss": 0.63139623, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.6534918, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 3.0642249584198 + }, + { + "auxiliary_loss_clip": 0.01148077, + "auxiliary_loss_mlp": 0.01102774, + "balance_loss_clip": 1.00169706, + "balance_loss_mlp": 1.00044441, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 4.089277647311575, + "language_loss": 0.77585465, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79836315, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 2.5934267044067383 + }, + { + "auxiliary_loss_clip": 0.01164977, + "auxiliary_loss_mlp": 0.01104421, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00056541, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.213201184308747, + "language_loss": 0.84179997, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86449397, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.548482656478882 + }, + { + "auxiliary_loss_clip": 0.01132771, + "auxiliary_loss_mlp": 0.01102994, + "balance_loss_clip": 1.00160336, + "balance_loss_mlp": 1.00047398, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 1.7171389169221176, + "language_loss": 0.72255754, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74491519, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 2.5817837715148926 + }, + { + "auxiliary_loss_clip": 0.01113543, + "auxiliary_loss_mlp": 0.01103686, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.0003078, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.5174949916141558, + "language_loss": 0.68265343, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.7048257, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.7754287719726562 + }, + { + "auxiliary_loss_clip": 0.01115208, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00045013, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.7540341788708393, + "language_loss": 0.84478915, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86696714, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.6388745307922363 + }, + { + "auxiliary_loss_clip": 0.01149481, + "auxiliary_loss_mlp": 0.01103194, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00048339, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.6453065267155453, + "language_loss": 0.75975996, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.7822867, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.540156364440918 + }, + { + "auxiliary_loss_clip": 0.01131392, + "auxiliary_loss_mlp": 0.01103082, + "balance_loss_clip": 1.00172865, + "balance_loss_mlp": 1.00037181, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 1.7486182073453738, + "language_loss": 0.76726419, + "learning_rate": 1.52708595287494e-08, + "loss": 0.78960896, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 2.5528101921081543 + }, + { + "auxiliary_loss_clip": 0.01164907, + "auxiliary_loss_mlp": 0.0074723, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.00034666, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.5311002605929058, + "language_loss": 0.67443419, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69355553, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01135463, + "auxiliary_loss_mlp": 0.01102625, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.00029588, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.6117899844386279, + "language_loss": 0.72676152, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.74914241, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 3.965452194213867 + }, + { + "auxiliary_loss_clip": 0.01132977, + "auxiliary_loss_mlp": 0.01101618, + "balance_loss_clip": 1.00180316, + "balance_loss_mlp": 1.00043309, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 1.6858328235697895, + "language_loss": 0.64970708, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67205304, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 2.62093186378479 + }, + { + "auxiliary_loss_clip": 0.0111812, + "auxiliary_loss_mlp": 0.01102565, + "balance_loss_clip": 1.00166202, + "balance_loss_mlp": 1.00042641, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 1.5231391835796573, + "language_loss": 0.75599372, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77820051, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 2.635941743850708 + }, + { + "auxiliary_loss_clip": 0.01148047, + "auxiliary_loss_mlp": 0.01102105, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00044346, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.4161677349883477, + "language_loss": 0.68350172, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70600325, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.527554988861084 + }, + { + "auxiliary_loss_clip": 0.01148249, + "auxiliary_loss_mlp": 0.01102809, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00048018, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.2806920470983305, + "language_loss": 0.64622116, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66873175, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.5996007919311523 + }, + { + "auxiliary_loss_clip": 0.01095259, + "auxiliary_loss_mlp": 0.01103283, + "balance_loss_clip": 1.00184441, + "balance_loss_mlp": 1.00057244, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.8151596855639323, + "language_loss": 0.75861371, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78059912, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.6418206691741943 + }, + { + "auxiliary_loss_clip": 0.01148152, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00046885, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.8533830543985954, + "language_loss": 0.79665244, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81915724, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.5357532501220703 + }, + { + "auxiliary_loss_clip": 0.01148096, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00048888, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 1.968825726198494, + "language_loss": 0.67101157, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69351411, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.8700754642486572 + }, + { + "auxiliary_loss_clip": 0.01133703, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_clip": 1.00176895, + "balance_loss_mlp": 1.00054812, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5107919104324903, + "language_loss": 0.78025508, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80260658, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.576080322265625 + }, + { + "auxiliary_loss_clip": 0.01165139, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00054932, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 1.8629305128022136, + "language_loss": 0.67727876, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.69997042, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 3.999335527420044 + }, + { + "auxiliary_loss_clip": 0.01133039, + "auxiliary_loss_mlp": 0.01103451, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00045419, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 29.109599162595593, + "language_loss": 0.72834414, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75070906, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 4.047034740447998 + }, + { + "auxiliary_loss_clip": 0.01148182, + "auxiliary_loss_mlp": 0.01102492, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00044918, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 1.6765793557849755, + "language_loss": 0.75521493, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77772164, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 2.536102056503296 + }, + { + "auxiliary_loss_clip": 0.0114844, + "auxiliary_loss_mlp": 0.01104246, + "balance_loss_clip": 1.0019685, + "balance_loss_mlp": 1.00048637, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.8310771415773814, + "language_loss": 0.69568121, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71820807, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.518970251083374 + }, + { + "auxiliary_loss_clip": 0.01148093, + "auxiliary_loss_mlp": 0.01102084, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.000422, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.676824233199051, + "language_loss": 0.68501669, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70751846, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.840569496154785 + }, + { + "auxiliary_loss_clip": 0.011359, + "auxiliary_loss_mlp": 0.01104528, + "balance_loss_clip": 1.00189507, + "balance_loss_mlp": 1.00048232, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 2.162196207290207, + "language_loss": 0.72248632, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74489057, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 4.126041650772095 + }, + { + "auxiliary_loss_clip": 0.01129304, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.00039101, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.0744467130966826, + "language_loss": 0.63003016, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65235418, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.7638790607452393 + }, + { + "auxiliary_loss_clip": 0.01131322, + "auxiliary_loss_mlp": 0.01101467, + "balance_loss_clip": 1.0017482, + "balance_loss_mlp": 1.0004729, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.8550816509959693, + "language_loss": 0.71477592, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73710382, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.8090741634368896 + }, + { + "auxiliary_loss_clip": 0.0111872, + "auxiliary_loss_mlp": 0.01102158, + "balance_loss_clip": 1.00174022, + "balance_loss_mlp": 1.00040066, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 2.054494606180261, + "language_loss": 0.77085996, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79306877, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 2.5903563499450684 + }, + { + "auxiliary_loss_clip": 0.01160441, + "auxiliary_loss_mlp": 0.01079383, + "balance_loss_clip": 1.00121951, + "balance_loss_mlp": 1.00003719, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8101488742124817, + "language_loss": 0.63137639, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65377462, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 2.999504804611206 + }, + { + "auxiliary_loss_clip": 0.01135061, + "auxiliary_loss_mlp": 0.01103103, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.0004878, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.9661861379294623, + "language_loss": 0.66666704, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68904871, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.6450791358947754 + }, + { + "auxiliary_loss_clip": 0.01096041, + "auxiliary_loss_mlp": 0.01102185, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00057077, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.66424845524256, + "language_loss": 0.79577345, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.8177557, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.6564292907714844 + }, + { + "auxiliary_loss_clip": 0.01118621, + "auxiliary_loss_mlp": 0.01102354, + "balance_loss_clip": 1.00164819, + "balance_loss_mlp": 1.0004065, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.5469610260792503, + "language_loss": 0.71921647, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74142617, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.7271041870117188 + }, + { + "auxiliary_loss_clip": 0.01131317, + "auxiliary_loss_mlp": 0.01102619, + "balance_loss_clip": 1.00164843, + "balance_loss_mlp": 1.00038505, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.734076203894156, + "language_loss": 0.76963645, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.7919758, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 2.6936094760894775 + }, + { + "auxiliary_loss_clip": 0.0111743, + "auxiliary_loss_mlp": 0.01104598, + "balance_loss_clip": 1.00162816, + "balance_loss_mlp": 1.00045657, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.061976931070963, + "language_loss": 0.64953405, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.67175436, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.7212610244750977 + }, + { + "auxiliary_loss_clip": 0.01135722, + "auxiliary_loss_mlp": 0.01102312, + "balance_loss_clip": 1.00187337, + "balance_loss_mlp": 1.00055456, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 1.8901611358940056, + "language_loss": 0.73357904, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75595939, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 2.684591770172119 + }, + { + "auxiliary_loss_clip": 0.0114981, + "auxiliary_loss_mlp": 0.01102379, + "balance_loss_clip": 1.00176322, + "balance_loss_mlp": 1.00043166, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.5003211745015836, + "language_loss": 0.81546229, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.8379842, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.6718194484710693 + }, + { + "auxiliary_loss_clip": 0.01148332, + "auxiliary_loss_mlp": 0.01102929, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.0005039, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.5629430739165266, + "language_loss": 0.81256098, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83507359, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.605377674102783 + }, + { + "auxiliary_loss_clip": 0.0115024, + "auxiliary_loss_mlp": 0.01103152, + "balance_loss_clip": 1.00173235, + "balance_loss_mlp": 1.00044107, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.8764742996770147, + "language_loss": 0.76226169, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78479564, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.5757718086242676 + }, + { + "auxiliary_loss_clip": 0.01118515, + "auxiliary_loss_mlp": 0.00747439, + "balance_loss_clip": 1.00166559, + "balance_loss_mlp": 1.00041962, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 2.1122840184330745, + "language_loss": 0.63094127, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.64960074, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 2.643562078475952 + }, + { + "auxiliary_loss_clip": 0.01150396, + "auxiliary_loss_mlp": 0.01103856, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00057316, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 2.2058206089310697, + "language_loss": 0.87194139, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89448392, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 2.5284712314605713 + }, + { + "auxiliary_loss_clip": 0.01097355, + "auxiliary_loss_mlp": 0.01080103, + "balance_loss_clip": 1.00231504, + "balance_loss_mlp": 0.99999487, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.7522891132619888, + "language_loss": 0.53233588, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55411047, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.288386583328247 + }, + { + "auxiliary_loss_clip": 0.01164974, + "auxiliary_loss_mlp": 0.01103096, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00028992, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.5214843742566813, + "language_loss": 0.73994946, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76263011, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 2.6982476711273193 + }, + { + "auxiliary_loss_clip": 0.01134713, + "auxiliary_loss_mlp": 0.01102811, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00057721, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 1.7222201099660945, + "language_loss": 0.66301119, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68538642, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 2.6552345752716064 + }, + { + "auxiliary_loss_clip": 0.01143562, + "auxiliary_loss_mlp": 0.0074631, + "balance_loss_clip": 1.00103724, + "balance_loss_mlp": 1.00085115, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8338691360177073, + "language_loss": 0.60768819, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62658691, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 3.1565451622009277 + }, + { + "auxiliary_loss_clip": 0.01147935, + "auxiliary_loss_mlp": 0.01102203, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00054097, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.9566967897494865, + "language_loss": 0.66611087, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68861222, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.57401704788208 + }, + { + "auxiliary_loss_clip": 0.01084907, + "auxiliary_loss_mlp": 0.01102352, + "balance_loss_clip": 1.00162065, + "balance_loss_mlp": 1.00049949, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.9390510440000392, + "language_loss": 0.65560335, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67747593, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.700828790664673 + }, + { + "auxiliary_loss_clip": 0.01119325, + "auxiliary_loss_mlp": 0.01103187, + "balance_loss_clip": 1.00174308, + "balance_loss_mlp": 1.00047672, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.31203956023899, + "language_loss": 0.73996639, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.76219147, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.6258959770202637 + }, + { + "auxiliary_loss_clip": 0.01165208, + "auxiliary_loss_mlp": 0.01102814, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.0004847, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 2.293232210605331, + "language_loss": 0.81881058, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84149075, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 2.5247983932495117 + }, + { + "auxiliary_loss_clip": 0.0112923, + "auxiliary_loss_mlp": 0.01102948, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00052297, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.691915996581213, + "language_loss": 0.7005946, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72291636, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 4.225375175476074 + }, + { + "auxiliary_loss_clip": 0.01164844, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_clip": 1.00176036, + "balance_loss_mlp": 1.00043416, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.6545099161007484, + "language_loss": 0.6288085, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65148169, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 2.5504767894744873 + }, + { + "auxiliary_loss_clip": 0.0111858, + "auxiliary_loss_mlp": 0.00747462, + "balance_loss_clip": 1.00169575, + "balance_loss_mlp": 1.00050879, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 2.0030439847777197, + "language_loss": 0.71083069, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.72949111, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 2.678421974182129 + }, + { + "auxiliary_loss_clip": 0.01100601, + "auxiliary_loss_mlp": 0.01102794, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00046515, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.1660299230764566, + "language_loss": 0.7320419, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75407583, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.6742517948150635 + }, + { + "auxiliary_loss_clip": 0.01149852, + "auxiliary_loss_mlp": 0.01103527, + "balance_loss_clip": 1.0018152, + "balance_loss_mlp": 1.00043511, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 1.9473864054119068, + "language_loss": 0.73368907, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75622284, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 2.507767915725708 + }, + { + "auxiliary_loss_clip": 0.01133564, + "auxiliary_loss_mlp": 0.01101925, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00045419, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.8604698479300623, + "language_loss": 0.71757048, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.73992538, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.593367338180542 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01103528, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00053072, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.8466475354732514, + "language_loss": 0.8107329, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83295858, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.650089979171753 + }, + { + "auxiliary_loss_clip": 0.01133777, + "auxiliary_loss_mlp": 0.01102787, + "balance_loss_clip": 1.00169635, + "balance_loss_mlp": 1.000458, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.5581381456954455, + "language_loss": 0.71599245, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73835808, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.6068830490112305 + }, + { + "auxiliary_loss_clip": 0.0113347, + "auxiliary_loss_mlp": 0.01101422, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00042808, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 1.8757235127038212, + "language_loss": 0.69976199, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72211093, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.5584511756896973 + }, + { + "auxiliary_loss_clip": 0.01083176, + "auxiliary_loss_mlp": 0.01102646, + "balance_loss_clip": 1.00157094, + "balance_loss_mlp": 1.00041163, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 1.8230109901584752, + "language_loss": 0.75292641, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77478456, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 4.102799415588379 + }, + { + "auxiliary_loss_clip": 0.01149475, + "auxiliary_loss_mlp": 0.01103098, + "balance_loss_clip": 1.00184524, + "balance_loss_mlp": 1.00057817, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.5464990712932085, + "language_loss": 0.62896651, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.65149218, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 3.967144250869751 + }, + { + "auxiliary_loss_clip": 0.0113184, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00058317, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.7079983583004286, + "language_loss": 0.68965971, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71200436, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.5831105709075928 + }, + { + "auxiliary_loss_clip": 0.01148307, + "auxiliary_loss_mlp": 0.01103293, + "balance_loss_clip": 1.00177896, + "balance_loss_mlp": 1.00048721, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 1.8088940754911782, + "language_loss": 0.63874221, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66125822, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.6418237686157227 + }, + { + "auxiliary_loss_clip": 0.01148367, + "auxiliary_loss_mlp": 0.01102354, + "balance_loss_clip": 1.00195372, + "balance_loss_mlp": 1.0004065, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.6285674513813713, + "language_loss": 0.71213597, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73464316, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 3.9646899700164795 + }, + { + "auxiliary_loss_clip": 0.0115046, + "auxiliary_loss_mlp": 0.01103989, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.0005157, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 2.8980991199701074, + "language_loss": 0.69391936, + "learning_rate": 1.278669873970606e-08, + "loss": 0.7164638, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.725621223449707 + }, + { + "auxiliary_loss_clip": 0.01143685, + "auxiliary_loss_mlp": 0.01079376, + "balance_loss_clip": 1.00113654, + "balance_loss_mlp": 1.00003004, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8417494340046223, + "language_loss": 0.59126163, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61349219, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 3.192260265350342 + }, + { + "auxiliary_loss_clip": 0.01164809, + "auxiliary_loss_mlp": 0.01102827, + "balance_loss_clip": 1.00182021, + "balance_loss_mlp": 1.00030684, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.558193782706596, + "language_loss": 0.7429294, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76560575, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.6038999557495117 + }, + { + "auxiliary_loss_clip": 0.01135082, + "auxiliary_loss_mlp": 0.01103138, + "balance_loss_clip": 1.00193751, + "balance_loss_mlp": 1.00042713, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.4375116092182827, + "language_loss": 0.68694097, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70932317, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 2.584103584289551 + }, + { + "auxiliary_loss_clip": 0.01148609, + "auxiliary_loss_mlp": 0.00747404, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00041008, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.4911345616102967, + "language_loss": 0.61830902, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63726914, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.6299335956573486 + }, + { + "auxiliary_loss_clip": 0.01116599, + "auxiliary_loss_mlp": 0.01102157, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00039983, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.7668288464698922, + "language_loss": 0.76911926, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.79130679, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 2.665562868118286 + }, + { + "auxiliary_loss_clip": 0.01133393, + "auxiliary_loss_mlp": 0.01102768, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.00053358, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5147860207390538, + "language_loss": 0.71630931, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73867083, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 2.5812036991119385 + }, + { + "auxiliary_loss_clip": 0.01164899, + "auxiliary_loss_mlp": 0.01102705, + "balance_loss_clip": 1.00178885, + "balance_loss_mlp": 1.00047076, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.9924842530995996, + "language_loss": 0.71821713, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74089319, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 2.5220422744750977 + }, + { + "auxiliary_loss_clip": 0.01148073, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.00054538, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.527906865407444, + "language_loss": 0.73969793, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76220459, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 2.594438076019287 + }, + { + "auxiliary_loss_clip": 0.01133262, + "auxiliary_loss_mlp": 0.01103097, + "balance_loss_clip": 1.00178862, + "balance_loss_mlp": 1.00057662, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 3.567077257234305, + "language_loss": 0.73662317, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75898677, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.7656314373016357 + }, + { + "auxiliary_loss_clip": 0.01132525, + "auxiliary_loss_mlp": 0.01102149, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00039184, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.5365943330096445, + "language_loss": 0.76777869, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.79012537, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.702298402786255 + }, + { + "auxiliary_loss_clip": 0.01128875, + "auxiliary_loss_mlp": 0.01079415, + "balance_loss_clip": 1.00116265, + "balance_loss_mlp": 1.0000689, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7241383331953896, + "language_loss": 0.64133358, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.6634165, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 3.2958362102508545 + }, + { + "auxiliary_loss_clip": 0.01101867, + "auxiliary_loss_mlp": 0.0110164, + "balance_loss_clip": 1.00165081, + "balance_loss_mlp": 1.00045538, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.072868882751717, + "language_loss": 0.93546391, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95749903, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.6914052963256836 + }, + { + "auxiliary_loss_clip": 0.0114845, + "auxiliary_loss_mlp": 0.01102747, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00051332, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.9053405190735435, + "language_loss": 0.82493031, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84744227, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.550363302230835 + }, + { + "auxiliary_loss_clip": 0.01150296, + "auxiliary_loss_mlp": 0.00747246, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00047159, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.4720829852583746, + "language_loss": 0.84138846, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86036384, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.6022627353668213 + }, + { + "auxiliary_loss_clip": 0.01131627, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00047469, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.777045023661779, + "language_loss": 0.67324889, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69559228, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.6074838638305664 + }, + { + "auxiliary_loss_clip": 0.01164876, + "auxiliary_loss_mlp": 0.01102317, + "balance_loss_clip": 1.00188148, + "balance_loss_mlp": 1.0004642, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 2.59343354319791, + "language_loss": 0.82268572, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84535766, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 2.4900906085968018 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.0110209, + "balance_loss_clip": 1.00168896, + "balance_loss_mlp": 1.00033319, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.7193640123424956, + "language_loss": 0.68956208, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.7119087, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.6189627647399902 + }, + { + "auxiliary_loss_clip": 0.01150259, + "auxiliary_loss_mlp": 0.01100982, + "balance_loss_clip": 1.00194323, + "balance_loss_mlp": 1.00056052, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.7223321130857505, + "language_loss": 0.67757809, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70009053, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 2.543678045272827 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00040269, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.8071305800739157, + "language_loss": 0.89052182, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91286623, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 2.58687162399292 + }, + { + "auxiliary_loss_clip": 0.01165112, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00053632, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 2.1361144745207286, + "language_loss": 0.76882845, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79151011, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 2.4932658672332764 + }, + { + "auxiliary_loss_clip": 0.0113496, + "auxiliary_loss_mlp": 0.01102702, + "balance_loss_clip": 1.00180626, + "balance_loss_mlp": 1.00046849, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.6922929609337085, + "language_loss": 0.66330516, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68568176, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.571383476257324 + }, + { + "auxiliary_loss_clip": 0.01150444, + "auxiliary_loss_mlp": 0.01102309, + "balance_loss_clip": 1.00195611, + "balance_loss_mlp": 1.00045633, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.6459433386766003, + "language_loss": 0.77916968, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.80169719, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.553942918777466 + }, + { + "auxiliary_loss_clip": 0.0116508, + "auxiliary_loss_mlp": 0.01103489, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00049222, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 4.1018679846084085, + "language_loss": 0.76063609, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.7833218, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.485807180404663 + }, + { + "auxiliary_loss_clip": 0.01133885, + "auxiliary_loss_mlp": 0.01102596, + "balance_loss_clip": 1.00168777, + "balance_loss_mlp": 1.00045729, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.6645730537808938, + "language_loss": 0.75882053, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.78118527, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 4.131295442581177 + }, + { + "auxiliary_loss_clip": 0.0111652, + "auxiliary_loss_mlp": 0.01102403, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00055075, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 1.8525512509273443, + "language_loss": 0.78888905, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81107831, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 2.5987939834594727 + }, + { + "auxiliary_loss_clip": 0.01132744, + "auxiliary_loss_mlp": 0.01103527, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00062525, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.635067857218968, + "language_loss": 0.72081649, + "learning_rate": 1.166897413780532e-08, + "loss": 0.7431792, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.562875270843506 + }, + { + "auxiliary_loss_clip": 0.01150237, + "auxiliary_loss_mlp": 0.01103385, + "balance_loss_clip": 1.00187814, + "balance_loss_mlp": 1.00048351, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 2.1112266380150135, + "language_loss": 0.59713972, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61967587, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 2.5974130630493164 + }, + { + "auxiliary_loss_clip": 0.01148592, + "auxiliary_loss_mlp": 0.01103311, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00050521, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.9980213416326709, + "language_loss": 0.72034538, + "learning_rate": 1.158510609718899e-08, + "loss": 0.74286437, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 2.5561389923095703 + }, + { + "auxiliary_loss_clip": 0.01147958, + "auxiliary_loss_mlp": 0.01101951, + "balance_loss_clip": 1.00171065, + "balance_loss_mlp": 1.00047994, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.5239202018455136, + "language_loss": 0.72141129, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74391037, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.577059745788574 + }, + { + "auxiliary_loss_clip": 0.01135415, + "auxiliary_loss_mlp": 0.01102805, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00038075, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 2.6106795176866555, + "language_loss": 0.73880517, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76118743, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 2.5711874961853027 + }, + { + "auxiliary_loss_clip": 0.01133743, + "auxiliary_loss_mlp": 0.01102289, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.00034118, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.6866902085430078, + "language_loss": 0.67596829, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69832873, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.641770362854004 + }, + { + "auxiliary_loss_clip": 0.01119956, + "auxiliary_loss_mlp": 0.01102414, + "balance_loss_clip": 1.00181186, + "balance_loss_mlp": 1.00046682, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.384419526898358, + "language_loss": 0.76956546, + "learning_rate": 1.141827483932789e-08, + "loss": 0.79178923, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.6757047176361084 + }, + { + "auxiliary_loss_clip": 0.0110042, + "auxiliary_loss_mlp": 0.01102658, + "balance_loss_clip": 1.00167632, + "balance_loss_mlp": 1.00051928, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.9551523237063018, + "language_loss": 0.79184598, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81387675, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 4.0560619831085205 + }, + { + "auxiliary_loss_clip": 0.01149893, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00039136, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.1513746349700242, + "language_loss": 0.67871529, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.70124817, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 4.003101587295532 + }, + { + "auxiliary_loss_clip": 0.0113164, + "auxiliary_loss_mlp": 0.01103819, + "balance_loss_clip": 1.00180411, + "balance_loss_mlp": 1.00034547, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 1.9764655262876967, + "language_loss": 0.68557668, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.70793128, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.6274518966674805 + }, + { + "auxiliary_loss_clip": 0.01150366, + "auxiliary_loss_mlp": 0.01102591, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00035703, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 2.755582149781384, + "language_loss": 0.78448153, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80701101, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 2.5253658294677734 + }, + { + "auxiliary_loss_clip": 0.0113328, + "auxiliary_loss_mlp": 0.01102388, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00034523, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.8118367634073629, + "language_loss": 0.71111226, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73346901, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 4.042266130447388 + }, + { + "auxiliary_loss_clip": 0.01164968, + "auxiliary_loss_mlp": 0.00747386, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00041556, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.6953213153769155, + "language_loss": 0.70490569, + "learning_rate": 1.117029020040916e-08, + "loss": 0.7240293, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.5602264404296875 + }, + { + "auxiliary_loss_clip": 0.0116513, + "auxiliary_loss_mlp": 0.01104511, + "balance_loss_clip": 1.00198781, + "balance_loss_mlp": 1.00046492, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 1.930446908520003, + "language_loss": 0.75040632, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.7731027, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.5077197551727295 + }, + { + "auxiliary_loss_clip": 0.01131554, + "auxiliary_loss_mlp": 0.01103815, + "balance_loss_clip": 1.00169647, + "balance_loss_mlp": 1.00043714, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.4986322858625027, + "language_loss": 0.69082499, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71317863, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01164732, + "auxiliary_loss_mlp": 0.01103466, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00037396, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 2.7077282731429326, + "language_loss": 0.77043003, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79311204, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 2.528536796569824 + }, + { + "auxiliary_loss_clip": 0.01164946, + "auxiliary_loss_mlp": 0.01102216, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00045931, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.8064958412001328, + "language_loss": 0.76181233, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78448397, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.4661436080932617 + }, + { + "auxiliary_loss_clip": 0.0113099, + "auxiliary_loss_mlp": 0.01102894, + "balance_loss_clip": 1.00175703, + "balance_loss_mlp": 1.00037384, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.6347976472444294, + "language_loss": 0.6896646, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71200341, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.630645751953125 + }, + { + "auxiliary_loss_clip": 0.0114839, + "auxiliary_loss_mlp": 0.01103371, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00046992, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.4998755227254006, + "language_loss": 0.75915813, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.7816757, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 2.551084518432617 + }, + { + "auxiliary_loss_clip": 0.01165146, + "auxiliary_loss_mlp": 0.01103375, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00056887, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 1.8666307036211132, + "language_loss": 0.70880157, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.7314868, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.5179388523101807 + }, + { + "auxiliary_loss_clip": 0.01132741, + "auxiliary_loss_mlp": 0.01104232, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00047255, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 1.6369310265371582, + "language_loss": 0.71409166, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.7364614, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 2.79054856300354 + }, + { + "auxiliary_loss_clip": 0.01164934, + "auxiliary_loss_mlp": 0.01102341, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.000489, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 2.072803780489882, + "language_loss": 0.78164673, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80431938, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.532788038253784 + }, + { + "auxiliary_loss_clip": 0.01129111, + "auxiliary_loss_mlp": 0.01103032, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00041676, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 2.046146901931835, + "language_loss": 0.90496999, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92729139, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.565122604370117 + }, + { + "auxiliary_loss_clip": 0.01148321, + "auxiliary_loss_mlp": 0.01103974, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00040531, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.5931817825689087, + "language_loss": 0.66235089, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68487382, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.671038866043091 + }, + { + "auxiliary_loss_clip": 0.0111543, + "auxiliary_loss_mlp": 0.01103327, + "balance_loss_clip": 1.00168848, + "balance_loss_mlp": 1.00052118, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.519773756050263, + "language_loss": 0.73376429, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75595188, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.610133647918701 + }, + { + "auxiliary_loss_clip": 0.01131591, + "auxiliary_loss_mlp": 0.01102814, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00038934, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.5493359354650793, + "language_loss": 0.73510259, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75744659, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 2.604518413543701 + }, + { + "auxiliary_loss_clip": 0.01114635, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00042152, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 1.6713375397124008, + "language_loss": 0.77145422, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79363382, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.614636182785034 + }, + { + "auxiliary_loss_clip": 0.01135352, + "auxiliary_loss_mlp": 0.01102642, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00050354, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.5362282220297145, + "language_loss": 0.80211091, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.8244909, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.613981246948242 + }, + { + "auxiliary_loss_clip": 0.01150379, + "auxiliary_loss_mlp": 0.01101722, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.0005374, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.4092991487545454, + "language_loss": 0.77753234, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.80005336, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 2.585453987121582 + }, + { + "auxiliary_loss_clip": 0.01124844, + "auxiliary_loss_mlp": 0.01079315, + "balance_loss_clip": 1.00125623, + "balance_loss_mlp": 0.99996966, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8148291419672897, + "language_loss": 0.5672394, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58928102, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 3.2159788608551025 + }, + { + "auxiliary_loss_clip": 0.01116206, + "auxiliary_loss_mlp": 0.01079364, + "balance_loss_clip": 1.00113547, + "balance_loss_mlp": 1.000018, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8763290134742076, + "language_loss": 0.61567736, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63763309, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 3.0752575397491455 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_clip": 1.00190008, + "balance_loss_mlp": 1.00060344, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.036161769365961, + "language_loss": 0.73834538, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76087487, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 2.5786428451538086 + }, + { + "auxiliary_loss_clip": 0.0115057, + "auxiliary_loss_mlp": 0.0110306, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00044465, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.0972396109476055, + "language_loss": 0.57202542, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.5945617, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 2.5719425678253174 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01079378, + "balance_loss_clip": 1.00112796, + "balance_loss_mlp": 1.00003195, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6632940711151546, + "language_loss": 0.54278857, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56504047, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.0632808208465576 + }, + { + "auxiliary_loss_clip": 0.01086007, + "auxiliary_loss_mlp": 0.01104293, + "balance_loss_clip": 1.00171697, + "balance_loss_mlp": 1.00043774, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 7.855946600966143, + "language_loss": 0.62542284, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64732581, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 2.804619312286377 + }, + { + "auxiliary_loss_clip": 0.01131391, + "auxiliary_loss_mlp": 0.01101694, + "balance_loss_clip": 1.0017333, + "balance_loss_mlp": 1.00050879, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 1.9387642892204908, + "language_loss": 0.75000966, + "learning_rate": 1.024483677309118e-08, + "loss": 0.77234048, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 3.9923572540283203 + }, + { + "auxiliary_loss_clip": 0.01149227, + "auxiliary_loss_mlp": 0.01101693, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.0004127, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 1.7971743282196875, + "language_loss": 0.66712832, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68963748, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 2.511648654937744 + }, + { + "auxiliary_loss_clip": 0.01143211, + "auxiliary_loss_mlp": 0.01078982, + "balance_loss_clip": 1.00105548, + "balance_loss_mlp": 1.00001812, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.6942640688977703, + "language_loss": 0.56582797, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58804989, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.0934951305389404 + }, + { + "auxiliary_loss_clip": 0.01133422, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_clip": 1.00191104, + "balance_loss_mlp": 1.0006144, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.9657618721600585, + "language_loss": 0.82469034, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84706163, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 2.518868923187256 + }, + { + "auxiliary_loss_clip": 0.01148001, + "auxiliary_loss_mlp": 0.01101111, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00049841, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.4819829606273098, + "language_loss": 0.7189284, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74141955, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 2.537276268005371 + }, + { + "auxiliary_loss_clip": 0.01117275, + "auxiliary_loss_mlp": 0.01102103, + "balance_loss_clip": 1.00164187, + "balance_loss_mlp": 1.00044155, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 1.9336383522463914, + "language_loss": 0.75796402, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.7801578, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.5909788608551025 + }, + { + "auxiliary_loss_clip": 0.01164947, + "auxiliary_loss_mlp": 0.01102789, + "balance_loss_clip": 1.0018276, + "balance_loss_mlp": 1.00045991, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 1.93754801839179, + "language_loss": 0.77563435, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79831171, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01135725, + "auxiliary_loss_mlp": 0.00747288, + "balance_loss_clip": 1.00197291, + "balance_loss_mlp": 1.00046837, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.6905912761606687, + "language_loss": 0.77897525, + "learning_rate": 9.971098618001272e-09, + "loss": 0.79780543, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 2.595062494277954 + }, + { + "auxiliary_loss_clip": 0.01101433, + "auxiliary_loss_mlp": 0.01102018, + "balance_loss_clip": 1.00173461, + "balance_loss_mlp": 1.00045121, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.40235525838461, + "language_loss": 0.75698221, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77901667, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 2.6913485527038574 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.01102156, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00039852, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 2.5851462923583344, + "language_loss": 0.69407511, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71659988, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.5172271728515625 + }, + { + "auxiliary_loss_clip": 0.01147807, + "auxiliary_loss_mlp": 0.0110266, + "balance_loss_clip": 1.00180829, + "balance_loss_mlp": 1.00052142, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 2.499032453599302, + "language_loss": 0.69086945, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71337408, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 4.00043797492981 + }, + { + "auxiliary_loss_clip": 0.0111843, + "auxiliary_loss_mlp": 0.01102376, + "balance_loss_clip": 1.00167096, + "balance_loss_mlp": 1.00033247, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 2.143980218639076, + "language_loss": 0.75707662, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77928472, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 2.6013081073760986 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01102898, + "balance_loss_clip": 1.0015763, + "balance_loss_mlp": 1.00037837, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.4803902876773403, + "language_loss": 0.74554002, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76773357, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.6369857788085938 + }, + { + "auxiliary_loss_clip": 0.01148378, + "auxiliary_loss_mlp": 0.01103926, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00054812, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 1.9644634975609947, + "language_loss": 0.74462569, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76714879, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.56046724319458 + }, + { + "auxiliary_loss_clip": 0.01143817, + "auxiliary_loss_mlp": 0.01079777, + "balance_loss_clip": 1.00115705, + "balance_loss_mlp": 1.00004959, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.9558345738106367, + "language_loss": 0.61476153, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63699746, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 4.484436511993408 + }, + { + "auxiliary_loss_clip": 0.01148097, + "auxiliary_loss_mlp": 0.01101923, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00064278, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.6470966667589468, + "language_loss": 0.74928278, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77178299, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.537045478820801 + }, + { + "auxiliary_loss_clip": 0.01099128, + "auxiliary_loss_mlp": 0.01103263, + "balance_loss_clip": 1.00154495, + "balance_loss_mlp": 1.0005523, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.5466398510456407, + "language_loss": 0.69315898, + "learning_rate": 9.62458290188839e-09, + "loss": 0.7151829, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.656900405883789 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01102967, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00054228, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.4629670623986313, + "language_loss": 0.64939791, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67157722, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.734400749206543 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.01104218, + "balance_loss_clip": 1.00169742, + "balance_loss_mlp": 1.00045848, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 1.9185562356804449, + "language_loss": 0.6315394, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65374053, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.673260450363159 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01104144, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00047922, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.4205124073721986, + "language_loss": 0.69604242, + "learning_rate": 9.510436165056867e-09, + "loss": 0.7185899, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 2.5053789615631104 + }, + { + "auxiliary_loss_clip": 0.01165064, + "auxiliary_loss_mlp": 0.0074736, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.0003829, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 1.834819537183281, + "language_loss": 0.76417637, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78330064, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.528426170349121 + }, + { + "auxiliary_loss_clip": 0.01119089, + "auxiliary_loss_mlp": 0.0110371, + "balance_loss_clip": 1.00185859, + "balance_loss_mlp": 1.00071299, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 4.986051223328192, + "language_loss": 0.78912079, + "learning_rate": 9.434715735916477e-09, + "loss": 0.8113488, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 2.5717859268188477 + }, + { + "auxiliary_loss_clip": 0.01131876, + "auxiliary_loss_mlp": 0.01102173, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00060654, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.6245968309254888, + "language_loss": 0.64642745, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66876793, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 2.5962557792663574 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.01102457, + "balance_loss_clip": 1.00168991, + "balance_loss_mlp": 1.00050938, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 1.9213864735761361, + "language_loss": 0.81215566, + "learning_rate": 9.359297236513519e-09, + "loss": 0.8345145, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 2.5889482498168945 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01103947, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.00047374, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 1.8209993840750438, + "language_loss": 0.7263577, + "learning_rate": 9.321701214040079e-09, + "loss": 0.74888086, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.5829389095306396 + }, + { + "auxiliary_loss_clip": 0.01164801, + "auxiliary_loss_mlp": 0.01102044, + "balance_loss_clip": 1.0017879, + "balance_loss_mlp": 1.00047767, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.5472216242178278, + "language_loss": 0.758811, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78147948, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 2.5145585536956787 + }, + { + "auxiliary_loss_clip": 0.01098569, + "auxiliary_loss_mlp": 0.01080219, + "balance_loss_clip": 1.00175405, + "balance_loss_mlp": 1.00010991, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.768251476270292, + "language_loss": 0.54915029, + "learning_rate": 9.246735630678015e-09, + "loss": 0.57093811, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.3277194499969482 + }, + { + "auxiliary_loss_clip": 0.01133586, + "auxiliary_loss_mlp": 0.01103061, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00054097, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.7532054896043725, + "language_loss": 0.70548809, + "learning_rate": 9.209366072632007e-09, + "loss": 0.72785461, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.7005226612091064 + }, + { + "auxiliary_loss_clip": 0.01148376, + "auxiliary_loss_mlp": 0.01103061, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.0004456, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.4246873830304358, + "language_loss": 0.72135901, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74387336, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.6057000160217285 + }, + { + "auxiliary_loss_clip": 0.01148352, + "auxiliary_loss_mlp": 0.00747298, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00041127, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.3301081361540312, + "language_loss": 0.68415475, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70311129, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.533276081085205 + }, + { + "auxiliary_loss_clip": 0.01148088, + "auxiliary_loss_mlp": 0.01102474, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00043082, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 2.2184161220906216, + "language_loss": 0.68679768, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70930326, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 2.5793097019195557 + }, + { + "auxiliary_loss_clip": 0.0110257, + "auxiliary_loss_mlp": 0.01102945, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00052047, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.6795223261313355, + "language_loss": 0.55476022, + "learning_rate": 9.060642764378457e-09, + "loss": 0.57681537, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.6834957599639893 + }, + { + "auxiliary_loss_clip": 0.0114796, + "auxiliary_loss_mlp": 0.01102945, + "balance_loss_clip": 1.001809, + "balance_loss_mlp": 1.00042462, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.2105922296747473, + "language_loss": 0.67893422, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70144325, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.5861005783081055 + }, + { + "auxiliary_loss_clip": 0.0114823, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_clip": 1.00187027, + "balance_loss_mlp": 1.00067782, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.9359668786775037, + "language_loss": 0.72453129, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74703979, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 2.6736044883728027 + }, + { + "auxiliary_loss_clip": 0.01131436, + "auxiliary_loss_mlp": 0.0110288, + "balance_loss_clip": 1.0016768, + "balance_loss_mlp": 1.00036049, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 3.2295700276189376, + "language_loss": 0.80274284, + "learning_rate": 8.949892992753395e-09, + "loss": 0.825086, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 2.5411903858184814 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_clip": 1.00113678, + "balance_loss_mlp": 0.99997395, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.76584128596732, + "language_loss": 0.54568565, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56757867, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 3.2200045585632324 + }, + { + "auxiliary_loss_clip": 0.01121209, + "auxiliary_loss_mlp": 0.00747465, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 1.00034881, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 2.3747184071873657, + "language_loss": 0.61397511, + "learning_rate": 8.876437313434682e-09, + "loss": 0.63266182, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 2.681206464767456 + }, + { + "auxiliary_loss_clip": 0.01116136, + "auxiliary_loss_mlp": 0.01102144, + "balance_loss_clip": 1.00157738, + "balance_loss_mlp": 1.00048256, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.6507192628236165, + "language_loss": 0.73895687, + "learning_rate": 8.839822728487155e-09, + "loss": 0.76113975, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 4.0206334590911865 + }, + { + "auxiliary_loss_clip": 0.01148072, + "auxiliary_loss_mlp": 0.01102265, + "balance_loss_clip": 1.00167918, + "balance_loss_mlp": 1.00050807, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 3.1884768438891107, + "language_loss": 0.75269961, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77520299, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 2.7315609455108643 + }, + { + "auxiliary_loss_clip": 0.01130998, + "auxiliary_loss_mlp": 0.01103996, + "balance_loss_clip": 1.00163507, + "balance_loss_mlp": 1.00042701, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.2948933860964664, + "language_loss": 0.74360633, + "learning_rate": 8.766820074958214e-09, + "loss": 0.76595628, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 2.5555286407470703 + }, + { + "auxiliary_loss_clip": 0.01148217, + "auxiliary_loss_mlp": 0.01102562, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00042367, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 2.8726523011297935, + "language_loss": 0.7479077, + "learning_rate": 8.730432009145027e-09, + "loss": 0.77041554, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.5518851280212402 + }, + { + "auxiliary_loss_clip": 0.01116201, + "auxiliary_loss_mlp": 0.01103414, + "balance_loss_clip": 1.00157285, + "balance_loss_mlp": 1.00041711, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.8453210381918996, + "language_loss": 0.67435551, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69655156, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 2.6151373386383057 + }, + { + "auxiliary_loss_clip": 0.01082221, + "auxiliary_loss_mlp": 0.0110244, + "balance_loss_clip": 1.00162601, + "balance_loss_mlp": 1.00049198, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 2.0071038634068867, + "language_loss": 0.70262712, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72447371, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 2.7524664402008057 + }, + { + "auxiliary_loss_clip": 0.01085667, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_clip": 1.00172293, + "balance_loss_mlp": 1.0005002, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.8226300901252703, + "language_loss": 0.80785114, + "learning_rate": 8.621720872059812e-09, + "loss": 0.82973897, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 2.696382522583008 + }, + { + "auxiliary_loss_clip": 0.01148642, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.00169325, + "balance_loss_mlp": 1.00038218, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 1.9177876961816518, + "language_loss": 0.67318958, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69215107, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.5345451831817627 + }, + { + "auxiliary_loss_clip": 0.01148376, + "auxiliary_loss_mlp": 0.01103075, + "balance_loss_clip": 1.00180078, + "balance_loss_mlp": 1.0005548, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.441174994253354, + "language_loss": 0.91125393, + "learning_rate": 8.54962434469919e-09, + "loss": 0.93376839, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.6250829696655273 + }, + { + "auxiliary_loss_clip": 0.01112243, + "auxiliary_loss_mlp": 0.00747253, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.000314, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.75832566590071, + "language_loss": 0.72595596, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74455094, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 2.6082663536071777 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.01102701, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.0004673, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 1.950948426784274, + "language_loss": 0.59978628, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62182713, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 2.622105121612549 + }, + { + "auxiliary_loss_clip": 0.01164814, + "auxiliary_loss_mlp": 0.01101618, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00043273, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.7143581290870775, + "language_loss": 0.78694773, + "learning_rate": 8.44204592704112e-09, + "loss": 0.80961204, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 4.087400436401367 + }, + { + "auxiliary_loss_clip": 0.01160378, + "auxiliary_loss_mlp": 0.01079369, + "balance_loss_clip": 1.00115657, + "balance_loss_mlp": 1.00002289, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7834096853919897, + "language_loss": 0.5433616, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56575906, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.1090340614318848 + }, + { + "auxiliary_loss_clip": 0.01148308, + "auxiliary_loss_mlp": 0.00747188, + "balance_loss_clip": 1.00188172, + "balance_loss_mlp": 1.00042462, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.62712261931647, + "language_loss": 0.71811312, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73706806, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.5337557792663574 + }, + { + "auxiliary_loss_clip": 0.01116774, + "auxiliary_loss_mlp": 0.01102185, + "balance_loss_clip": 1.00165498, + "balance_loss_mlp": 1.00042784, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.8174097042186876, + "language_loss": 0.78806829, + "learning_rate": 8.335147190060787e-09, + "loss": 0.81025791, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.662916421890259 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01102632, + "balance_loss_clip": 1.0015676, + "balance_loss_mlp": 1.00039828, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.7719408592043515, + "language_loss": 0.72811431, + "learning_rate": 8.299665324196903e-09, + "loss": 0.75047743, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 4.1890904903411865 + }, + { + "auxiliary_loss_clip": 0.0108514, + "auxiliary_loss_mlp": 0.01103956, + "balance_loss_clip": 1.00169849, + "balance_loss_mlp": 1.00048256, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 6.715478868674515, + "language_loss": 0.83587885, + "learning_rate": 8.264258983809114e-09, + "loss": 0.85776979, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.7049849033355713 + }, + { + "auxiliary_loss_clip": 0.01117915, + "auxiliary_loss_mlp": 0.01101842, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.00037098, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.4910805249116117, + "language_loss": 0.79076076, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81295836, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.6326465606689453 + }, + { + "auxiliary_loss_clip": 0.01133017, + "auxiliary_loss_mlp": 0.01102923, + "balance_loss_clip": 1.00189149, + "balance_loss_mlp": 1.00040281, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.8173182640977337, + "language_loss": 0.70674253, + "learning_rate": 8.193672884830195e-09, + "loss": 0.7291019, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.5611624717712402 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.01103164, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00054908, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.43285293862063, + "language_loss": 0.75561464, + "learning_rate": 8.158493128915812e-09, + "loss": 0.7779637, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 2.645087242126465 + }, + { + "auxiliary_loss_clip": 0.01101904, + "auxiliary_loss_mlp": 0.01103569, + "balance_loss_clip": 1.0015837, + "balance_loss_mlp": 1.00047731, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.0400530039234996, + "language_loss": 0.72643399, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74848878, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.6441075801849365 + }, + { + "auxiliary_loss_clip": 0.01118771, + "auxiliary_loss_mlp": 0.01103517, + "balance_loss_clip": 1.00169611, + "balance_loss_mlp": 1.0004251, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 1.8832552556661228, + "language_loss": 0.57391238, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59613526, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 2.660418748855591 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.01103085, + "balance_loss_clip": 1.00163746, + "balance_loss_mlp": 1.00037456, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 2.2141079498068663, + "language_loss": 0.71501505, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73720419, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 2.623645782470703 + }, + { + "auxiliary_loss_clip": 0.01118833, + "auxiliary_loss_mlp": 0.01102572, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.00052929, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.7620050072234643, + "language_loss": 0.68646538, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70867944, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 2.5939242839813232 + }, + { + "auxiliary_loss_clip": 0.01150269, + "auxiliary_loss_mlp": 0.01102488, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00034988, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 1.8059111472110219, + "language_loss": 0.86204994, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88457745, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 2.579674005508423 + }, + { + "auxiliary_loss_clip": 0.01102551, + "auxiliary_loss_mlp": 0.01104021, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00045156, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 3.8037197041918827, + "language_loss": 0.64314365, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66520941, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.663485050201416 + }, + { + "auxiliary_loss_clip": 0.01150209, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.00193083, + "balance_loss_mlp": 1.00045729, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.4876924164605863, + "language_loss": 0.78024411, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80276835, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.5949501991271973 + }, + { + "auxiliary_loss_clip": 0.01148676, + "auxiliary_loss_mlp": 0.01102734, + "balance_loss_clip": 1.00197685, + "balance_loss_mlp": 1.00040507, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 2.227510978275609, + "language_loss": 0.57155502, + "learning_rate": 7.879774302919307e-09, + "loss": 0.59406912, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.5197980403900146 + }, + { + "auxiliary_loss_clip": 0.01131644, + "auxiliary_loss_mlp": 0.01102964, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00044405, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.0762981597361705, + "language_loss": 0.72249073, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74483681, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 2.619575023651123 + }, + { + "auxiliary_loss_clip": 0.01132651, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_clip": 1.00163448, + "balance_loss_mlp": 1.00052595, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.779251455332006, + "language_loss": 0.68365312, + "learning_rate": 7.810849984090984e-09, + "loss": 0.7060082, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.6217429637908936 + }, + { + "auxiliary_loss_clip": 0.01087467, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_clip": 1.00150561, + "balance_loss_mlp": 1.00039256, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.6896085200968254, + "language_loss": 0.67445374, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69635749, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 2.746539354324341 + }, + { + "auxiliary_loss_clip": 0.01131186, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00043774, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 1.7332035359934155, + "language_loss": 0.77196646, + "learning_rate": 7.742227841308624e-09, + "loss": 0.79075247, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.6140739917755127 + }, + { + "auxiliary_loss_clip": 0.01148361, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_clip": 1.0017947, + "balance_loss_mlp": 1.00042379, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.4757017736768687, + "language_loss": 0.76698899, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78950787, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 2.6249709129333496 + }, + { + "auxiliary_loss_clip": 0.01164863, + "auxiliary_loss_mlp": 0.01102388, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00043988, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.288225559270936, + "language_loss": 0.62975252, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65242505, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 2.489769458770752 + }, + { + "auxiliary_loss_clip": 0.01055123, + "auxiliary_loss_mlp": 0.01103229, + "balance_loss_clip": 1.00147212, + "balance_loss_mlp": 1.0005188, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 1.9335662174653545, + "language_loss": 0.62555432, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64713788, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 2.9230878353118896 + }, + { + "auxiliary_loss_clip": 0.01134494, + "auxiliary_loss_mlp": 0.01102645, + "balance_loss_clip": 1.00174391, + "balance_loss_mlp": 1.0005064, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.5294474998039156, + "language_loss": 0.77754962, + "learning_rate": 7.605890125470527e-09, + "loss": 0.79992098, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 2.919847249984741 + }, + { + "auxiliary_loss_clip": 0.01118934, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_clip": 1.00171506, + "balance_loss_mlp": 1.00044823, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.1673892201668603, + "language_loss": 0.79745311, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81966448, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 2.589603900909424 + }, + { + "auxiliary_loss_clip": 0.01118977, + "auxiliary_loss_mlp": 0.01102155, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00039816, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 1.9938430498244033, + "language_loss": 0.77497292, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79718423, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 4.2202067375183105 + }, + { + "auxiliary_loss_clip": 0.01132599, + "auxiliary_loss_mlp": 0.01102625, + "balance_loss_clip": 1.0017724, + "balance_loss_mlp": 1.00048614, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.704576527053498, + "language_loss": 0.65300483, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67535704, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 2.722029209136963 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01102869, + "balance_loss_clip": 1.00176024, + "balance_loss_mlp": 1.00044465, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.76118218090448, + "language_loss": 0.80504572, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82742178, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 2.668011426925659 + }, + { + "auxiliary_loss_clip": 0.0111714, + "auxiliary_loss_mlp": 0.01101441, + "balance_loss_clip": 1.00162601, + "balance_loss_mlp": 1.00035119, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.604322729936979, + "language_loss": 0.78048235, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80266815, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 2.6378121376037598 + }, + { + "auxiliary_loss_clip": 0.01150138, + "auxiliary_loss_mlp": 0.01102599, + "balance_loss_clip": 1.00178587, + "balance_loss_mlp": 1.00036478, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.7737271145458495, + "language_loss": 0.50911379, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53164119, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 2.6899571418762207 + }, + { + "auxiliary_loss_clip": 0.01148593, + "auxiliary_loss_mlp": 0.01102908, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00038803, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.770662416486782, + "language_loss": 0.8091988, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83171386, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.5353567600250244 + }, + { + "auxiliary_loss_clip": 0.0115019, + "auxiliary_loss_mlp": 0.01103082, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00037146, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.6106245335319294, + "language_loss": 0.82838237, + "learning_rate": 7.336841261255111e-09, + "loss": 0.85091507, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.5111443996429443 + }, + { + "auxiliary_loss_clip": 0.01084879, + "auxiliary_loss_mlp": 0.01103701, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00051391, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.9544613699280524, + "language_loss": 0.75191581, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77380162, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 2.66448712348938 + }, + { + "auxiliary_loss_clip": 0.01131696, + "auxiliary_loss_mlp": 0.01101698, + "balance_loss_clip": 1.00160837, + "balance_loss_mlp": 1.00051343, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.6420667495264134, + "language_loss": 0.8546043, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87693828, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.5782310962677 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.0110209, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00052404, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.4975650809554069, + "language_loss": 0.76053011, + "learning_rate": 7.237194675009828e-09, + "loss": 0.78271794, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.573011875152588 + }, + { + "auxiliary_loss_clip": 0.01115758, + "auxiliary_loss_mlp": 0.01079341, + "balance_loss_clip": 1.00114584, + "balance_loss_mlp": 0.99999505, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7092802413819317, + "language_loss": 0.52504683, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54699779, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.1580021381378174 + }, + { + "auxiliary_loss_clip": 0.01131884, + "auxiliary_loss_mlp": 0.01102603, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00046432, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 1.5560615073134938, + "language_loss": 0.76212716, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78447205, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 4.081562042236328 + }, + { + "auxiliary_loss_clip": 0.01165048, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00054586, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 2.063671763610132, + "language_loss": 0.67600155, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69869518, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 3.958714008331299 + }, + { + "auxiliary_loss_clip": 0.01164859, + "auxiliary_loss_mlp": 0.01103436, + "balance_loss_clip": 1.00182331, + "balance_loss_mlp": 1.00053477, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.662445147392128, + "language_loss": 0.7763347, + "learning_rate": 7.10539048654768e-09, + "loss": 0.79901767, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.5167431831359863 + }, + { + "auxiliary_loss_clip": 0.01132983, + "auxiliary_loss_mlp": 0.01103295, + "balance_loss_clip": 1.00168419, + "balance_loss_mlp": 1.000489, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.7290253399767748, + "language_loss": 0.79138601, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81374872, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 2.5747644901275635 + }, + { + "auxiliary_loss_clip": 0.01083929, + "auxiliary_loss_mlp": 0.01103924, + "balance_loss_clip": 1.00162864, + "balance_loss_mlp": 1.00054538, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 1.8021993997820742, + "language_loss": 0.684111, + "learning_rate": 7.039941811905592e-09, + "loss": 0.7059896, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 4.134926080703735 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.01101665, + "balance_loss_clip": 1.00155997, + "balance_loss_mlp": 1.00038457, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.3698060093185862, + "language_loss": 0.72686714, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74905038, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 2.709143877029419 + }, + { + "auxiliary_loss_clip": 0.01133193, + "auxiliary_loss_mlp": 0.01102788, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00045848, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 1.960387724697434, + "language_loss": 0.72988892, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75224876, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.6636791229248047 + }, + { + "auxiliary_loss_clip": 0.01164936, + "auxiliary_loss_mlp": 0.01102406, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.9063487622431958, + "language_loss": 0.7745896, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79726303, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.54714298248291 + }, + { + "auxiliary_loss_clip": 0.01133598, + "auxiliary_loss_mlp": 0.01103778, + "balance_loss_clip": 1.00181472, + "balance_loss_mlp": 1.00049472, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 1.9996907356495908, + "language_loss": 0.79047012, + "learning_rate": 6.909951351435905e-09, + "loss": 0.8128438, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.592759847640991 + }, + { + "auxiliary_loss_clip": 0.01164832, + "auxiliary_loss_mlp": 0.01101846, + "balance_loss_clip": 1.00180387, + "balance_loss_mlp": 1.0004704, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.5265381638485938, + "language_loss": 0.74364513, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76631188, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 2.563098192214966 + }, + { + "auxiliary_loss_clip": 0.01086504, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_clip": 1.00156271, + "balance_loss_mlp": 1.00041866, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.7694904204570343, + "language_loss": 0.84269607, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86458766, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 2.6658084392547607 + }, + { + "auxiliary_loss_clip": 0.01148425, + "auxiliary_loss_mlp": 0.01101995, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00052357, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.7819291449045669, + "language_loss": 0.70710218, + "learning_rate": 6.813252072591425e-09, + "loss": 0.72960639, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.593656063079834 + }, + { + "auxiliary_loss_clip": 0.01116607, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_clip": 1.00171828, + "balance_loss_mlp": 1.00043702, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.9786338921392908, + "language_loss": 0.77353168, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79571015, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.590132474899292 + }, + { + "auxiliary_loss_clip": 0.01116096, + "auxiliary_loss_mlp": 0.00747401, + "balance_loss_clip": 1.00157189, + "balance_loss_mlp": 1.00040507, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.5268969617625658, + "language_loss": 0.78685606, + "learning_rate": 6.749163793864144e-09, + "loss": 0.80549103, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 2.6355950832366943 + }, + { + "auxiliary_loss_clip": 0.01135392, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00190556, + "balance_loss_mlp": 1.00047517, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.2936702833894844, + "language_loss": 0.78085887, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80323994, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.609707832336426 + }, + { + "auxiliary_loss_clip": 0.0111735, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_clip": 1.00159383, + "balance_loss_mlp": 1.00052214, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 1.8925268125362549, + "language_loss": 0.78464806, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80685675, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.6118645668029785 + }, + { + "auxiliary_loss_clip": 0.01134004, + "auxiliary_loss_mlp": 0.01101904, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00052881, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.7050152010832247, + "language_loss": 0.80138195, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82374108, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 2.6367948055267334 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_clip": 1.0015192, + "balance_loss_mlp": 1.00030744, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 1.668381746196006, + "language_loss": 0.66104019, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68305659, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.6439731121063232 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01101941, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.0004698, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.713707168399978, + "language_loss": 0.7427696, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76511991, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.5812571048736572 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.01102898, + "balance_loss_clip": 1.00168777, + "balance_loss_mlp": 1.00056922, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.6808949100715347, + "language_loss": 0.66858983, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69062483, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 2.780898332595825 + }, + { + "auxiliary_loss_clip": 0.01088977, + "auxiliary_loss_mlp": 0.01103603, + "balance_loss_clip": 1.00172353, + "balance_loss_mlp": 1.00041533, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.641281944649869, + "language_loss": 0.71544999, + "learning_rate": 6.527235786226937e-09, + "loss": 0.7373758, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.688532829284668 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01102955, + "balance_loss_clip": 1.00178528, + "balance_loss_mlp": 1.00043464, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6300430863393653, + "language_loss": 0.78710997, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80929852, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 2.670760154724121 + }, + { + "auxiliary_loss_clip": 0.01131393, + "auxiliary_loss_mlp": 0.01102317, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00036871, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.151892388518219, + "language_loss": 0.77259988, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79493701, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.567974805831909 + }, + { + "auxiliary_loss_clip": 0.01132845, + "auxiliary_loss_mlp": 0.01102811, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00048161, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.7418352611655317, + "language_loss": 0.81153393, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83389044, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.5897600650787354 + }, + { + "auxiliary_loss_clip": 0.01164753, + "auxiliary_loss_mlp": 0.01102235, + "balance_loss_clip": 1.00177145, + "balance_loss_mlp": 1.00047767, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 2.765565507305885, + "language_loss": 0.75311804, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77578795, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.486572027206421 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_clip": 1.00169921, + "balance_loss_mlp": 1.00041032, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.5840515522159366, + "language_loss": 0.66654396, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68872797, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 2.6811258792877197 + }, + { + "auxiliary_loss_clip": 0.01148237, + "auxiliary_loss_mlp": 0.01102323, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00047112, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.6667206256230545, + "language_loss": 0.87951201, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90201759, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 4.022390842437744 + }, + { + "auxiliary_loss_clip": 0.01083284, + "auxiliary_loss_mlp": 0.01102817, + "balance_loss_clip": 1.00160182, + "balance_loss_mlp": 1.00058341, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.9433790233871122, + "language_loss": 0.7480318, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76989281, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 2.6896610260009766 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01079031, + "balance_loss_clip": 1.00109231, + "balance_loss_mlp": 1.0000664, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8076120700763091, + "language_loss": 0.59148669, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61356801, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.132636308670044 + }, + { + "auxiliary_loss_clip": 0.01114883, + "auxiliary_loss_mlp": 0.00747198, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00046492, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.6007257048231427, + "language_loss": 0.68878812, + "learning_rate": 6.247342505960818e-09, + "loss": 0.70740891, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 2.684758186340332 + }, + { + "auxiliary_loss_clip": 0.01150264, + "auxiliary_loss_mlp": 0.01103575, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00067353, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.6493117684585783, + "language_loss": 0.82837868, + "learning_rate": 6.216621253462894e-09, + "loss": 0.8509171, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 2.5059316158294678 + }, + { + "auxiliary_loss_clip": 0.01164903, + "auxiliary_loss_mlp": 0.01101614, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00052404, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.718567557251253, + "language_loss": 0.77960265, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80226779, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 2.5063796043395996 + }, + { + "auxiliary_loss_clip": 0.01145582, + "auxiliary_loss_mlp": 0.0107934, + "balance_loss_clip": 1.00115824, + "balance_loss_mlp": 0.9999941, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8377156102090173, + "language_loss": 0.55792505, + "learning_rate": 6.155405563025962e-09, + "loss": 0.58017427, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 3.0222904682159424 + }, + { + "auxiliary_loss_clip": 0.01150199, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00045371, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.5457246401527314, + "language_loss": 0.74954855, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77207458, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.5818231105804443 + }, + { + "auxiliary_loss_clip": 0.01131493, + "auxiliary_loss_mlp": 0.01101915, + "balance_loss_clip": 1.00173593, + "balance_loss_mlp": 1.00034904, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.7790304583061223, + "language_loss": 0.72344476, + "learning_rate": 6.094492299733245e-09, + "loss": 0.7457788, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.6102852821350098 + }, + { + "auxiliary_loss_clip": 0.01131263, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00044632, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 1.794697091518873, + "language_loss": 0.7674154, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78976434, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 2.594980001449585 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01078988, + "balance_loss_clip": 1.00145459, + "balance_loss_mlp": 1.00002337, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7554985660809556, + "language_loss": 0.53889883, + "learning_rate": 6.033881472824465e-09, + "loss": 0.56098723, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 2.998457193374634 + }, + { + "auxiliary_loss_clip": 0.01164762, + "auxiliary_loss_mlp": 0.01101993, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00033188, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.7995725916991137, + "language_loss": 0.71453857, + "learning_rate": 6.003689475888807e-09, + "loss": 0.7372061, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 3.9077820777893066 + }, + { + "auxiliary_loss_clip": 0.01147927, + "auxiliary_loss_mlp": 0.01103812, + "balance_loss_clip": 1.00173247, + "balance_loss_mlp": 1.00052869, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 6.020331637697532, + "language_loss": 0.78653461, + "learning_rate": 5.973573091493156e-09, + "loss": 0.80905199, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 3.9544546604156494 + }, + { + "auxiliary_loss_clip": 0.01145677, + "auxiliary_loss_mlp": 0.01103493, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00049663, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.714543961833622, + "language_loss": 0.76933277, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79182446, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 2.5624613761901855 + }, + { + "auxiliary_loss_clip": 0.01149844, + "auxiliary_loss_mlp": 0.01102556, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00041699, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.6519085607539683, + "language_loss": 0.75513595, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77765995, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.5370802879333496 + }, + { + "auxiliary_loss_clip": 0.01102133, + "auxiliary_loss_mlp": 0.0110254, + "balance_loss_clip": 1.00163054, + "balance_loss_mlp": 1.00049639, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.5485042658724735, + "language_loss": 0.72453475, + "learning_rate": 5.8836776249509e-09, + "loss": 0.74658144, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 4.147377967834473 + }, + { + "auxiliary_loss_clip": 0.01132678, + "auxiliary_loss_mlp": 0.00747399, + "balance_loss_clip": 1.00174236, + "balance_loss_mlp": 1.00040019, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 1.9536117154263404, + "language_loss": 0.83700293, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85580373, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.652508497238159 + }, + { + "auxiliary_loss_clip": 0.01120914, + "auxiliary_loss_mlp": 0.01103644, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.0005517, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 2.522096653977407, + "language_loss": 0.59934944, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62159503, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.6383917331695557 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01102665, + "balance_loss_clip": 1.00176871, + "balance_loss_mlp": 1.00052595, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 2.1515033708506457, + "language_loss": 0.82476902, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84696954, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.637079954147339 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01102107, + "balance_loss_clip": 1.00180137, + "balance_loss_mlp": 1.00054073, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.693738117976163, + "language_loss": 0.83387464, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85606599, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.6153769493103027 + }, + { + "auxiliary_loss_clip": 0.01148319, + "auxiliary_loss_mlp": 0.01102908, + "balance_loss_clip": 1.00191605, + "balance_loss_mlp": 1.00048387, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5729027771759718, + "language_loss": 0.75288689, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77539921, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 2.553617238998413 + }, + { + "auxiliary_loss_clip": 0.01150197, + "auxiliary_loss_mlp": 0.01103771, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00048852, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.5444094162522843, + "language_loss": 0.69956136, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72210097, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.5359857082366943 + }, + { + "auxiliary_loss_clip": 0.01133171, + "auxiliary_loss_mlp": 0.01103134, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.00042295, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 2.0719064774076634, + "language_loss": 0.83931297, + "learning_rate": 5.676568187055197e-09, + "loss": 0.86167604, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.6595382690429688 + }, + { + "auxiliary_loss_clip": 0.01100424, + "auxiliary_loss_mlp": 0.01101847, + "balance_loss_clip": 1.00167787, + "balance_loss_mlp": 1.00047112, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.3647718999659808, + "language_loss": 0.78221565, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80423838, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.6518688201904297 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01101266, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00046217, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4330077085117179, + "language_loss": 0.74016452, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76282382, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 2.4606881141662598 + }, + { + "auxiliary_loss_clip": 0.01088982, + "auxiliary_loss_mlp": 0.01103193, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00038755, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.4672917740755465, + "language_loss": 0.7973761, + "learning_rate": 5.58894135118404e-09, + "loss": 0.81929785, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.7529211044311523 + }, + { + "auxiliary_loss_clip": 0.0108376, + "auxiliary_loss_mlp": 0.01103984, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00060534, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.8809485925576366, + "language_loss": 0.79436398, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81624144, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 2.7218306064605713 + }, + { + "auxiliary_loss_clip": 0.0114764, + "auxiliary_loss_mlp": 0.01102179, + "balance_loss_clip": 1.00177777, + "balance_loss_mlp": 1.00051725, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 1.8794516432594015, + "language_loss": 0.66803539, + "learning_rate": 5.530901600093507e-09, + "loss": 0.69053358, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.559278964996338 + }, + { + "auxiliary_loss_clip": 0.01160405, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_clip": 1.0011698, + "balance_loss_mlp": 0.99996626, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7775472492130655, + "language_loss": 0.59913337, + "learning_rate": 5.501995169700846e-09, + "loss": 0.62153059, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.1252715587615967 + }, + { + "auxiliary_loss_clip": 0.01150114, + "auxiliary_loss_mlp": 0.01102656, + "balance_loss_clip": 1.00175226, + "balance_loss_mlp": 1.000422, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.67878863760372, + "language_loss": 0.78845608, + "learning_rate": 5.473164370872307e-09, + "loss": 0.81098378, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.5520355701446533 + }, + { + "auxiliary_loss_clip": 0.0115048, + "auxiliary_loss_mlp": 0.01103289, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00048327, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.6896181880584487, + "language_loss": 0.65110469, + "learning_rate": 5.444409204701461e-09, + "loss": 0.6736424, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 2.574084758758545 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00053787, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.1768887664119676, + "language_loss": 0.7663874, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78893268, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.5424234867095947 + }, + { + "auxiliary_loss_clip": 0.01149802, + "auxiliary_loss_mlp": 0.01103894, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.00061071, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.669182669410219, + "language_loss": 0.64068937, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66322637, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.7124359607696533 + }, + { + "auxiliary_loss_clip": 0.01116343, + "auxiliary_loss_mlp": 0.00747398, + "balance_loss_clip": 1.00164175, + "balance_loss_mlp": 1.00037241, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.560313086882439, + "language_loss": 0.75357854, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77221596, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 2.6896111965179443 + }, + { + "auxiliary_loss_clip": 0.01164823, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00047874, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 1.8822834518697995, + "language_loss": 0.7829743, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80565065, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.556903839111328 + }, + { + "auxiliary_loss_clip": 0.01148585, + "auxiliary_loss_mlp": 0.01102974, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00054908, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.5722003069938404, + "language_loss": 0.75445044, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77696604, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.574028253555298 + }, + { + "auxiliary_loss_clip": 0.01145163, + "auxiliary_loss_mlp": 0.01079001, + "balance_loss_clip": 1.00122166, + "balance_loss_mlp": 1.00003707, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6766094592950279, + "language_loss": 0.59805906, + "learning_rate": 5.273466554344353e-09, + "loss": 0.62030077, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.198436975479126 + }, + { + "auxiliary_loss_clip": 0.01134874, + "auxiliary_loss_mlp": 0.01103838, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00045991, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.592431682978685, + "language_loss": 0.73466414, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75705129, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 4.064363241195679 + }, + { + "auxiliary_loss_clip": 0.01148635, + "auxiliary_loss_mlp": 0.0110292, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00049496, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.8639376573605502, + "language_loss": 0.79165864, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81417418, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.5479736328125 + }, + { + "auxiliary_loss_clip": 0.01150367, + "auxiliary_loss_mlp": 0.01102656, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00032663, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.2307955891237183, + "language_loss": 0.74329519, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76582539, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 2.539989471435547 + }, + { + "auxiliary_loss_clip": 0.01148333, + "auxiliary_loss_mlp": 0.01103021, + "balance_loss_clip": 1.00188661, + "balance_loss_mlp": 1.00050068, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 4.870776103162688, + "language_loss": 0.70044595, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72295946, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 2.620805263519287 + }, + { + "auxiliary_loss_clip": 0.01148346, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00040901, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.5314000883634975, + "language_loss": 0.66440952, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68693078, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 2.5608599185943604 + }, + { + "auxiliary_loss_clip": 0.01098576, + "auxiliary_loss_mlp": 0.01104029, + "balance_loss_clip": 1.00175607, + "balance_loss_mlp": 1.00046027, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.9094044336925746, + "language_loss": 0.7268002, + "learning_rate": 5.105246951967679e-09, + "loss": 0.74882615, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 2.685312509536743 + }, + { + "auxiliary_loss_clip": 0.0115009, + "auxiliary_loss_mlp": 0.01102831, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.0004065, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.738530610914247, + "language_loss": 0.68963879, + "learning_rate": 5.077475108526297e-09, + "loss": 0.71216798, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 2.627018451690674 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01101456, + "balance_loss_clip": 1.00164878, + "balance_loss_mlp": 1.00055718, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.742425417503187, + "language_loss": 0.86834216, + "learning_rate": 5.049778912747049e-09, + "loss": 0.89036703, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 2.7535643577575684 + }, + { + "auxiliary_loss_clip": 0.01068073, + "auxiliary_loss_mlp": 0.01102353, + "balance_loss_clip": 1.001436, + "balance_loss_mlp": 1.00030959, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 6.115831205456375, + "language_loss": 0.70141852, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72312278, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 2.8582332134246826 + }, + { + "auxiliary_loss_clip": 0.01135233, + "auxiliary_loss_mlp": 0.01103244, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.0005331, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.496044606074476, + "language_loss": 0.73758936, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75997412, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 2.61315655708313 + }, + { + "auxiliary_loss_clip": 0.0113498, + "auxiliary_loss_mlp": 0.01102765, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00043559, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.7548919819001498, + "language_loss": 0.70763898, + "learning_rate": 4.967144221869501e-09, + "loss": 0.73001647, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 2.7003209590911865 + }, + { + "auxiliary_loss_clip": 0.01165079, + "auxiliary_loss_mlp": 0.01103489, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.00049162, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 2.713002468969358, + "language_loss": 0.63985431, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66254008, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 2.657715082168579 + }, + { + "auxiliary_loss_clip": 0.01131508, + "auxiliary_loss_mlp": 0.01102884, + "balance_loss_clip": 1.0019753, + "balance_loss_mlp": 1.00045943, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.4229724909652985, + "language_loss": 0.70339417, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72573805, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 5.560126304626465 + }, + { + "auxiliary_loss_clip": 0.01084218, + "auxiliary_loss_mlp": 0.01103283, + "balance_loss_clip": 1.00166619, + "balance_loss_mlp": 1.00057232, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 1.804164847372697, + "language_loss": 0.66698003, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68885505, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.7370760440826416 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01102557, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00032306, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.5694503895605438, + "language_loss": 0.74200141, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76419449, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 2.668557643890381 + }, + { + "auxiliary_loss_clip": 0.01133646, + "auxiliary_loss_mlp": 0.01101362, + "balance_loss_clip": 1.00191796, + "balance_loss_mlp": 1.00051093, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.5413958359007378, + "language_loss": 0.7763378, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79868788, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 2.611607074737549 + }, + { + "auxiliary_loss_clip": 0.01067738, + "auxiliary_loss_mlp": 0.0110295, + "balance_loss_clip": 1.00161123, + "balance_loss_mlp": 1.00042975, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.7471456930352687, + "language_loss": 0.70581353, + "learning_rate": 4.803917467869567e-09, + "loss": 0.72752047, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 4.338299989700317 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.01100918, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00054407, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 4.63467414121266, + "language_loss": 0.85685337, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87919939, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 2.7464871406555176 + }, + { + "auxiliary_loss_clip": 0.01150359, + "auxiliary_loss_mlp": 0.01102455, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.00041163, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.5781261166887404, + "language_loss": 0.70547855, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.72800672, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.5635712146759033 + }, + { + "auxiliary_loss_clip": 0.01145403, + "auxiliary_loss_mlp": 0.01102318, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00037003, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.7892484549545766, + "language_loss": 0.84548599, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86796319, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.5929181575775146 + }, + { + "auxiliary_loss_clip": 0.01150148, + "auxiliary_loss_mlp": 0.01103186, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00047565, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 2.0358806106925473, + "language_loss": 0.79045904, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81299239, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.5577731132507324 + }, + { + "auxiliary_loss_clip": 0.01112197, + "auxiliary_loss_mlp": 0.01102499, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00064659, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.670916097990014, + "language_loss": 0.79342121, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81556815, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 2.6338980197906494 + }, + { + "auxiliary_loss_clip": 0.01148128, + "auxiliary_loss_mlp": 0.01103235, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00061989, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.5829534228321083, + "language_loss": 0.80141819, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82393181, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.5830914974212646 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01101579, + "balance_loss_clip": 1.00165653, + "balance_loss_mlp": 1.00048947, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 3.520665084106837, + "language_loss": 0.83608544, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85841489, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.560588836669922 + }, + { + "auxiliary_loss_clip": 0.01148509, + "auxiliary_loss_mlp": 0.01102789, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.00045943, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.654939623934737, + "language_loss": 0.72034991, + "learning_rate": 4.590518683360134e-09, + "loss": 0.74286294, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.515519618988037 + }, + { + "auxiliary_loss_clip": 0.01148202, + "auxiliary_loss_mlp": 0.01101423, + "balance_loss_clip": 1.00186622, + "balance_loss_mlp": 1.00042892, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.89350598866648, + "language_loss": 0.64179516, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66429138, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 2.5353379249572754 + }, + { + "auxiliary_loss_clip": 0.01131381, + "auxiliary_loss_mlp": 0.01102123, + "balance_loss_clip": 1.00180316, + "balance_loss_mlp": 1.00041413, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.5055246348070503, + "language_loss": 0.71050167, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73283672, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.6209096908569336 + }, + { + "auxiliary_loss_clip": 0.01148295, + "auxiliary_loss_mlp": 0.01101471, + "balance_loss_clip": 1.0015831, + "balance_loss_mlp": 1.00047672, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.3906491695300462, + "language_loss": 0.582322, + "learning_rate": 4.511742602582691e-09, + "loss": 0.6048196, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.564016819000244 + }, + { + "auxiliary_loss_clip": 0.01148229, + "auxiliary_loss_mlp": 0.01102295, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00044227, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.701199979354643, + "language_loss": 0.81536043, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83786571, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 2.5723636150360107 + }, + { + "auxiliary_loss_clip": 0.0113345, + "auxiliary_loss_mlp": 0.00747439, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00034297, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.4092879464817645, + "language_loss": 0.7148267, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73363566, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.6426992416381836 + }, + { + "auxiliary_loss_clip": 0.01116142, + "auxiliary_loss_mlp": 0.01102877, + "balance_loss_clip": 1.00160348, + "balance_loss_mlp": 1.00045204, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.90946777577285, + "language_loss": 0.75379908, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77598929, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.586400270462036 + }, + { + "auxiliary_loss_clip": 0.01148341, + "auxiliary_loss_mlp": 0.0110295, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00042987, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.5676920708053477, + "language_loss": 0.67053461, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69304752, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.543957471847534 + }, + { + "auxiliary_loss_clip": 0.01165011, + "auxiliary_loss_mlp": 0.00747423, + "balance_loss_clip": 1.0019201, + "balance_loss_mlp": 1.00039339, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 2.0422528082093248, + "language_loss": 0.62698328, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64610767, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 2.604404926300049 + }, + { + "auxiliary_loss_clip": 0.01114222, + "auxiliary_loss_mlp": 0.01103391, + "balance_loss_clip": 1.00166655, + "balance_loss_mlp": 1.00039458, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 1.5839278508327468, + "language_loss": 0.73600507, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75818121, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 2.6266138553619385 + }, + { + "auxiliary_loss_clip": 0.01149909, + "auxiliary_loss_mlp": 0.01103688, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.00040555, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 1.6252824335399745, + "language_loss": 0.84326744, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86580336, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.6041886806488037 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01101023, + "balance_loss_clip": 1.00157881, + "balance_loss_mlp": 1.0004108, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.3237950315010307, + "language_loss": 0.71669924, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73887414, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 2.602182149887085 + }, + { + "auxiliary_loss_clip": 0.01150505, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_clip": 1.00195587, + "balance_loss_mlp": 1.00055039, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.8430542075481184, + "language_loss": 0.80734158, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.8298735, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.5596776008605957 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01102753, + "balance_loss_clip": 1.00179076, + "balance_loss_mlp": 1.00051892, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.6520732785395797, + "language_loss": 0.75553429, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77789587, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 4.189762115478516 + }, + { + "auxiliary_loss_clip": 0.01150312, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00047696, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.6604679729677323, + "language_loss": 0.78458786, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80711901, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.627507448196411 + }, + { + "auxiliary_loss_clip": 0.01148112, + "auxiliary_loss_mlp": 0.01102154, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00039721, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.4380661025087997, + "language_loss": 0.72768879, + "learning_rate": 4.203448764984019e-09, + "loss": 0.75019145, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 2.5541648864746094 + }, + { + "auxiliary_loss_clip": 0.01133577, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00042462, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 1.9060327234669638, + "language_loss": 0.88747239, + "learning_rate": 4.178249514071419e-09, + "loss": 0.90982902, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 2.607440710067749 + }, + { + "auxiliary_loss_clip": 0.01148327, + "auxiliary_loss_mlp": 0.01103449, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.0004518, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.386579436910668, + "language_loss": 0.77913707, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80165482, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 2.5465850830078125 + }, + { + "auxiliary_loss_clip": 0.01131565, + "auxiliary_loss_mlp": 0.01102596, + "balance_loss_clip": 1.00176346, + "balance_loss_mlp": 1.00045764, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.237795874299197, + "language_loss": 0.74792445, + "learning_rate": 4.128078058480921e-09, + "loss": 0.770266, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 2.5720677375793457 + }, + { + "auxiliary_loss_clip": 0.0112889, + "auxiliary_loss_mlp": 0.01102887, + "balance_loss_clip": 1.0018748, + "balance_loss_mlp": 1.00046277, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.6372258294935558, + "language_loss": 0.79714674, + "learning_rate": 4.103105855705724e-09, + "loss": 0.8194645, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 2.5934934616088867 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01103126, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00060582, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.375506587155297, + "language_loss": 0.83051455, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85273683, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.579873561859131 + }, + { + "auxiliary_loss_clip": 0.01116628, + "auxiliary_loss_mlp": 0.01101676, + "balance_loss_clip": 1.00167382, + "balance_loss_mlp": 1.00039613, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.9849257115614163, + "language_loss": 0.70368284, + "learning_rate": 4.053388504930089e-09, + "loss": 0.7258659, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.635089159011841 + }, + { + "auxiliary_loss_clip": 0.01133678, + "auxiliary_loss_mlp": 0.0110405, + "balance_loss_clip": 1.00183666, + "balance_loss_mlp": 1.00048137, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 1.90361320543687, + "language_loss": 0.7199384, + "learning_rate": 4.028643358815032e-09, + "loss": 0.74231577, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 2.563291311264038 + }, + { + "auxiliary_loss_clip": 0.0113539, + "auxiliary_loss_mlp": 0.0110183, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00035894, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.5587155905943784, + "language_loss": 0.73622262, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75859481, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 2.579833745956421 + }, + { + "auxiliary_loss_clip": 0.01117256, + "auxiliary_loss_mlp": 0.01100812, + "balance_loss_clip": 1.00171506, + "balance_loss_mlp": 1.00039053, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.3888400370960212, + "language_loss": 0.74753177, + "learning_rate": 3.979380129822018e-09, + "loss": 0.76971245, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.6464741230010986 + }, + { + "auxiliary_loss_clip": 0.01128773, + "auxiliary_loss_mlp": 0.01079365, + "balance_loss_clip": 1.00117433, + "balance_loss_mlp": 1.00001931, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.749582770889272, + "language_loss": 0.57794517, + "learning_rate": 3.954862048811902e-09, + "loss": 0.60002655, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 3.055997848510742 + }, + { + "auxiliary_loss_clip": 0.01100188, + "auxiliary_loss_mlp": 0.01102374, + "balance_loss_clip": 1.00163841, + "balance_loss_mlp": 1.00052166, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.6945654007342508, + "language_loss": 0.66587114, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68789679, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 5.539604187011719 + }, + { + "auxiliary_loss_clip": 0.01126926, + "auxiliary_loss_mlp": 0.01078985, + "balance_loss_clip": 1.00123167, + "balance_loss_mlp": 1.00002098, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.9003778557126175, + "language_loss": 0.54532945, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56738865, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 3.210167169570923 + }, + { + "auxiliary_loss_clip": 0.01150426, + "auxiliary_loss_mlp": 0.01102699, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00036931, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.6055472924917322, + "language_loss": 0.79764628, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82017756, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.5959818363189697 + }, + { + "auxiliary_loss_clip": 0.01133257, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_clip": 1.00179756, + "balance_loss_mlp": 1.00045288, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.747868572931597, + "language_loss": 0.63143027, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65377927, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 2.574490785598755 + }, + { + "auxiliary_loss_clip": 0.01148305, + "auxiliary_loss_mlp": 0.01102858, + "balance_loss_clip": 1.00185525, + "balance_loss_mlp": 1.00043309, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 2.2207702522729877, + "language_loss": 0.7266376, + "learning_rate": 3.833407015731316e-09, + "loss": 0.7491492, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 3.998702049255371 + }, + { + "auxiliary_loss_clip": 0.01112021, + "auxiliary_loss_mlp": 0.01079368, + "balance_loss_clip": 1.0010066, + "balance_loss_mlp": 1.00002241, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.689464465721208, + "language_loss": 0.51674867, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53866255, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.18648362159729 + }, + { + "auxiliary_loss_clip": 0.0114831, + "auxiliary_loss_mlp": 0.01102221, + "balance_loss_clip": 1.00180805, + "balance_loss_mlp": 1.00055969, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.4285343421020045, + "language_loss": 0.69565845, + "learning_rate": 3.785354859932033e-09, + "loss": 0.71816379, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.5934946537017822 + }, + { + "auxiliary_loss_clip": 0.01164944, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00037682, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.6710012668714431, + "language_loss": 0.55598426, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57865793, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 2.6295390129089355 + }, + { + "auxiliary_loss_clip": 0.01106432, + "auxiliary_loss_mlp": 0.01102407, + "balance_loss_clip": 1.00189626, + "balance_loss_mlp": 1.0005548, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.6044228557822366, + "language_loss": 0.73204243, + "learning_rate": 3.737605490767404e-09, + "loss": 0.7541309, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.667370319366455 + }, + { + "auxiliary_loss_clip": 0.01131325, + "auxiliary_loss_mlp": 0.01101788, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00041211, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.543028155553316, + "language_loss": 0.81810009, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84043127, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 2.5515077114105225 + }, + { + "auxiliary_loss_clip": 0.01145766, + "auxiliary_loss_mlp": 0.01079313, + "balance_loss_clip": 1.00115335, + "balance_loss_mlp": 0.9999668, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7482988205907789, + "language_loss": 0.53577238, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55802315, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 2.973024845123291 + }, + { + "auxiliary_loss_clip": 0.01102572, + "auxiliary_loss_mlp": 0.01103714, + "balance_loss_clip": 1.00176597, + "balance_loss_mlp": 1.00062132, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 1.6266689429291226, + "language_loss": 0.73426902, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75633192, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.688027858734131 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01102549, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00041068, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.615476923281979, + "language_loss": 0.78684223, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.8092227, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 2.5934948921203613 + }, + { + "auxiliary_loss_clip": 0.01148391, + "auxiliary_loss_mlp": 0.01102263, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00050592, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.6140061365308986, + "language_loss": 0.80765033, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83015692, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 2.5931077003479004 + }, + { + "auxiliary_loss_clip": 0.0116504, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00054991, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 1.9791820900413766, + "language_loss": 0.84696949, + "learning_rate": 3.596174175278799e-09, + "loss": 0.86966014, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.4683420658111572 + }, + { + "auxiliary_loss_clip": 0.0113172, + "auxiliary_loss_mlp": 0.01103303, + "balance_loss_clip": 1.00173128, + "balance_loss_mlp": 1.00040174, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.4487395976843305, + "language_loss": 0.74460316, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76695335, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.6849029064178467 + }, + { + "auxiliary_loss_clip": 0.01099429, + "auxiliary_loss_mlp": 0.01101779, + "balance_loss_clip": 1.00174809, + "balance_loss_mlp": 1.00049889, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.5738779412923731, + "language_loss": 0.76434195, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78635406, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.658756732940674 + }, + { + "auxiliary_loss_clip": 0.01119041, + "auxiliary_loss_mlp": 0.01103789, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00041103, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.6547676221523693, + "language_loss": 0.67377704, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69600534, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.6167352199554443 + }, + { + "auxiliary_loss_clip": 0.01150322, + "auxiliary_loss_mlp": 0.01103889, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.0006063, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.443353203926026, + "language_loss": 0.73572314, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75826532, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 2.6299359798431396 + }, + { + "auxiliary_loss_clip": 0.01135777, + "auxiliary_loss_mlp": 0.01104268, + "balance_loss_clip": 1.00179851, + "balance_loss_mlp": 1.00050783, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.6601711721270127, + "language_loss": 0.8085115, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83091187, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.6108052730560303 + }, + { + "auxiliary_loss_clip": 0.01165103, + "auxiliary_loss_mlp": 0.01103144, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00052857, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 2.0064689046836484, + "language_loss": 0.75874013, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78142262, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.5194380283355713 + }, + { + "auxiliary_loss_clip": 0.01165288, + "auxiliary_loss_mlp": 0.0110492, + "balance_loss_clip": 1.00190771, + "balance_loss_mlp": 1.0003978, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.4747452888546477, + "language_loss": 0.66649377, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68919587, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 2.5597598552703857 + }, + { + "auxiliary_loss_clip": 0.01148249, + "auxiliary_loss_mlp": 0.01102582, + "balance_loss_clip": 1.00181067, + "balance_loss_mlp": 1.00044394, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.7392477795852403, + "language_loss": 0.7334224, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75593078, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.5636205673217773 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_clip": 1.0019182, + "balance_loss_mlp": 1.00037289, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 2.111480365660179, + "language_loss": 0.76737607, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78988332, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.527454137802124 + }, + { + "auxiliary_loss_clip": 0.01148485, + "auxiliary_loss_mlp": 0.00747348, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00037706, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.0493814063684574, + "language_loss": 0.72899652, + "learning_rate": 3.366511715771958e-09, + "loss": 0.74795485, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.5006964206695557 + }, + { + "auxiliary_loss_clip": 0.01100504, + "auxiliary_loss_mlp": 0.01103497, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00050032, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.7649613185726933, + "language_loss": 0.7820155, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80405545, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 4.067234992980957 + }, + { + "auxiliary_loss_clip": 0.01133234, + "auxiliary_loss_mlp": 0.01104298, + "balance_loss_clip": 1.00176275, + "balance_loss_mlp": 1.00063348, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.42371330672581, + "language_loss": 0.63900423, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66137958, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.6904983520507812 + }, + { + "auxiliary_loss_clip": 0.01116283, + "auxiliary_loss_mlp": 0.01103094, + "balance_loss_clip": 1.00148773, + "balance_loss_mlp": 1.00066972, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.8109842183475913, + "language_loss": 0.73040235, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75259614, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 2.5891549587249756 + }, + { + "auxiliary_loss_clip": 0.01145208, + "auxiliary_loss_mlp": 0.0110233, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00038195, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 2.0373804312101944, + "language_loss": 0.72723782, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.74971318, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 2.5593667030334473 + }, + { + "auxiliary_loss_clip": 0.01082912, + "auxiliary_loss_mlp": 0.0110155, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00041294, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.5263298521878688, + "language_loss": 0.81627655, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83812118, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 2.7121427059173584 + }, + { + "auxiliary_loss_clip": 0.01103837, + "auxiliary_loss_mlp": 0.01102058, + "balance_loss_clip": 1.00168788, + "balance_loss_mlp": 1.0004921, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 1.936679355629646, + "language_loss": 0.62907785, + "learning_rate": 3.232348386403405e-09, + "loss": 0.65113682, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 2.6460208892822266 + }, + { + "auxiliary_loss_clip": 0.01165121, + "auxiliary_loss_mlp": 0.01103377, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00047565, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 1.8560825135083168, + "language_loss": 0.85743421, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.8801192, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.473639726638794 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.0110152, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00043082, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.3728970440490786, + "language_loss": 0.67252791, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69489276, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 2.672842264175415 + }, + { + "auxiliary_loss_clip": 0.01164941, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00185084, + "balance_loss_mlp": 1.00034845, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.6592663056854222, + "language_loss": 0.77380729, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79648161, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.5197181701660156 + }, + { + "auxiliary_loss_clip": 0.01114745, + "auxiliary_loss_mlp": 0.01102191, + "balance_loss_clip": 1.00161421, + "balance_loss_mlp": 1.00043392, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.5004074277376267, + "language_loss": 0.75543284, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.7776022, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.702711582183838 + }, + { + "auxiliary_loss_clip": 0.01135222, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00045681, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 1.929297925921004, + "language_loss": 0.6670543, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68944103, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 2.613234043121338 + }, + { + "auxiliary_loss_clip": 0.01148132, + "auxiliary_loss_mlp": 0.01101125, + "balance_loss_clip": 1.00180709, + "balance_loss_mlp": 1.00041699, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.3993101704440727, + "language_loss": 0.79414809, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81664073, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 2.5301513671875 + }, + { + "auxiliary_loss_clip": 0.01148644, + "auxiliary_loss_mlp": 0.01103909, + "balance_loss_clip": 1.00169563, + "balance_loss_mlp": 1.00053048, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 2.206846298163585, + "language_loss": 0.75196099, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77448648, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 3.984025239944458 + }, + { + "auxiliary_loss_clip": 0.01071686, + "auxiliary_loss_mlp": 0.01100919, + "balance_loss_clip": 1.00158572, + "balance_loss_mlp": 1.00049746, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.6924784605690424, + "language_loss": 0.66605306, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68777907, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 2.866431951522827 + }, + { + "auxiliary_loss_clip": 0.0113298, + "auxiliary_loss_mlp": 0.01102394, + "balance_loss_clip": 1.00175738, + "balance_loss_mlp": 1.00044608, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 6.340521938774739, + "language_loss": 0.69042885, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71278262, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.600917100906372 + }, + { + "auxiliary_loss_clip": 0.01128822, + "auxiliary_loss_mlp": 0.01100448, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00040781, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 1.7005245678380776, + "language_loss": 0.75735343, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.77964604, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 2.546809196472168 + }, + { + "auxiliary_loss_clip": 0.01119594, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.00165832, + "balance_loss_mlp": 1.00038433, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 2.162253145345589, + "language_loss": 0.84020543, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86243045, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.6265852451324463 + }, + { + "auxiliary_loss_clip": 0.0111736, + "auxiliary_loss_mlp": 0.01102655, + "balance_loss_clip": 1.00159395, + "balance_loss_mlp": 1.00032592, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.8271376830225676, + "language_loss": 0.68756759, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70976776, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 4.202759742736816 + }, + { + "auxiliary_loss_clip": 0.01145742, + "auxiliary_loss_mlp": 0.0074732, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00037408, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.3962461828174961, + "language_loss": 0.66257966, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68151021, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.579033374786377 + }, + { + "auxiliary_loss_clip": 0.0113133, + "auxiliary_loss_mlp": 0.01101386, + "balance_loss_clip": 1.00172246, + "balance_loss_mlp": 1.00039172, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.4859357361099141, + "language_loss": 0.74322999, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76555705, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.611631155014038 + }, + { + "auxiliary_loss_clip": 0.01149284, + "auxiliary_loss_mlp": 0.01102098, + "balance_loss_clip": 1.00174582, + "balance_loss_mlp": 1.00043607, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 2.0084497387742384, + "language_loss": 0.77586585, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79837966, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.5899746417999268 + }, + { + "auxiliary_loss_clip": 0.01148127, + "auxiliary_loss_mlp": 0.01102819, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00048995, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.7260367725670889, + "language_loss": 0.73317218, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75568163, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 2.578186511993408 + }, + { + "auxiliary_loss_clip": 0.01133515, + "auxiliary_loss_mlp": 0.01102464, + "balance_loss_clip": 1.00172675, + "balance_loss_mlp": 1.00042081, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.6758417628306348, + "language_loss": 0.75837815, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.780738, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 2.606947422027588 + }, + { + "auxiliary_loss_clip": 0.01133513, + "auxiliary_loss_mlp": 0.01102641, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.00040674, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 1.9562740078150864, + "language_loss": 0.80229259, + "learning_rate": 2.846214118442436e-09, + "loss": 0.8246541, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.5711755752563477 + }, + { + "auxiliary_loss_clip": 0.01150134, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_clip": 1.00180662, + "balance_loss_mlp": 1.00036502, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.3254942016343643, + "language_loss": 0.67465711, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69718915, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.5757594108581543 + }, + { + "auxiliary_loss_clip": 0.0116493, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00035596, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.8001200780259592, + "language_loss": 0.69812578, + "learning_rate": 2.804824870920264e-09, + "loss": 0.72079426, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 2.508657693862915 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01103549, + "balance_loss_clip": 1.00183558, + "balance_loss_mlp": 1.00045705, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.7592198732386928, + "language_loss": 0.84124088, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86375475, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.59574031829834 + }, + { + "auxiliary_loss_clip": 0.01164954, + "auxiliary_loss_mlp": 0.01102219, + "balance_loss_clip": 1.00190413, + "balance_loss_mlp": 1.00036669, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.6380016456909183, + "language_loss": 0.75753522, + "learning_rate": 2.76373855876022e-09, + "loss": 0.78020704, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.522101640701294 + }, + { + "auxiliary_loss_clip": 0.01165043, + "auxiliary_loss_mlp": 0.01103421, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00042415, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.6484968030515499, + "language_loss": 0.70980716, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73249179, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.492217779159546 + }, + { + "auxiliary_loss_clip": 0.01133171, + "auxiliary_loss_mlp": 0.01101135, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00042677, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.8216180635345638, + "language_loss": 0.62991381, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65225691, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.557366371154785 + }, + { + "auxiliary_loss_clip": 0.0109951, + "auxiliary_loss_mlp": 0.01101997, + "balance_loss_clip": 1.00158703, + "balance_loss_mlp": 1.00043035, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 2.473976294292051, + "language_loss": 0.75454295, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77655798, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.662489175796509 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.01102651, + "balance_loss_clip": 1.00170231, + "balance_loss_mlp": 1.00041759, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.7028214780892816, + "language_loss": 0.76054984, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78272974, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.664001703262329 + }, + { + "auxiliary_loss_clip": 0.01164835, + "auxiliary_loss_mlp": 0.01102488, + "balance_loss_clip": 1.00183308, + "balance_loss_mlp": 1.00044441, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 2.5527277591023902, + "language_loss": 0.77478218, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79745543, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.5142128467559814 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.01102367, + "balance_loss_clip": 1.00193274, + "balance_loss_mlp": 1.00051439, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.450799908347979, + "language_loss": 0.61796176, + "learning_rate": 2.642297296540974e-09, + "loss": 0.64032108, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 2.591513156890869 + }, + { + "auxiliary_loss_clip": 0.01149692, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00059414, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.5866639681049566, + "language_loss": 0.65425611, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67677462, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 2.5492615699768066 + }, + { + "auxiliary_loss_clip": 0.01148172, + "auxiliary_loss_mlp": 0.00747326, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00041461, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.787351920184866, + "language_loss": 0.6864208, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70537579, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 2.596407890319824 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.011028, + "balance_loss_clip": 1.0017662, + "balance_loss_mlp": 1.0005666, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.729189906650227, + "language_loss": 0.72988927, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75256622, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.476353168487549 + }, + { + "auxiliary_loss_clip": 0.01145901, + "auxiliary_loss_mlp": 0.01078925, + "balance_loss_clip": 1.00115585, + "balance_loss_mlp": 0.99996036, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7743897778900497, + "language_loss": 0.65197122, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67421949, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 4.564627647399902 + }, + { + "auxiliary_loss_clip": 0.01148153, + "auxiliary_loss_mlp": 0.01102522, + "balance_loss_clip": 1.00174379, + "balance_loss_mlp": 1.00038314, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.7041082240922683, + "language_loss": 0.70661259, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.7291193, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.5242557525634766 + }, + { + "auxiliary_loss_clip": 0.01164863, + "auxiliary_loss_mlp": 0.01102271, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00051439, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.7326918479000428, + "language_loss": 0.81473374, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83740509, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.547233819961548 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.01101734, + "balance_loss_clip": 1.00175428, + "balance_loss_mlp": 1.00050151, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.885329302168072, + "language_loss": 0.69460177, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71660674, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 2.657414197921753 + }, + { + "auxiliary_loss_clip": 0.01135266, + "auxiliary_loss_mlp": 0.01103278, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.00047207, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.6422463234280946, + "language_loss": 0.8073197, + "learning_rate": 2.484617081468521e-09, + "loss": 0.82970512, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 2.633951425552368 + }, + { + "auxiliary_loss_clip": 0.01164887, + "auxiliary_loss_mlp": 0.01102239, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00038683, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.4463785488983285, + "language_loss": 0.62369955, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64637077, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 2.572028398513794 + }, + { + "auxiliary_loss_clip": 0.01118062, + "auxiliary_loss_mlp": 0.01102697, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.00055861, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 2.733801925667264, + "language_loss": 0.72703731, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74924487, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 2.6500611305236816 + }, + { + "auxiliary_loss_clip": 0.0116499, + "auxiliary_loss_mlp": 0.01103042, + "balance_loss_clip": 1.00187862, + "balance_loss_mlp": 1.00052261, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.5846521694770146, + "language_loss": 0.71083581, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73351622, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.5409555435180664 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01102897, + "balance_loss_clip": 1.00167155, + "balance_loss_mlp": 1.00037646, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 3.1543149116281817, + "language_loss": 0.68234336, + "learning_rate": 2.407594853716999e-09, + "loss": 0.7045151, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.593194007873535 + }, + { + "auxiliary_loss_clip": 0.01131311, + "auxiliary_loss_mlp": 0.0110301, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00058568, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 1.8694235675499593, + "language_loss": 0.78856826, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81091142, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.5615234375 + }, + { + "auxiliary_loss_clip": 0.01148194, + "auxiliary_loss_mlp": 0.01102836, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00041127, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.4978216808773832, + "language_loss": 0.82233196, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84484226, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 2.6184985637664795 + }, + { + "auxiliary_loss_clip": 0.01135929, + "auxiliary_loss_mlp": 0.01103448, + "balance_loss_clip": 1.00190747, + "balance_loss_mlp": 1.00045085, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.8504414238444802, + "language_loss": 0.74548006, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76787376, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.5662105083465576 + }, + { + "auxiliary_loss_clip": 0.01103495, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.00043583, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.613347340727334, + "language_loss": 0.66142619, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.6834954, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 4.1362831592559814 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.00175965, + "balance_loss_mlp": 1.00053608, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.7424109256146705, + "language_loss": 0.70251167, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72486222, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 4.106287240982056 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01102905, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00048053, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 1.9784651084719151, + "language_loss": 0.80872643, + "learning_rate": 2.294333993509978e-09, + "loss": 0.83109295, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 2.5580806732177734 + }, + { + "auxiliary_loss_clip": 0.01120358, + "auxiliary_loss_mlp": 0.01103101, + "balance_loss_clip": 1.00168872, + "balance_loss_mlp": 1.00058067, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.7825950481251691, + "language_loss": 0.67765582, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.69989049, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 2.648810863494873 + }, + { + "auxiliary_loss_clip": 0.01147942, + "auxiliary_loss_mlp": 0.00747207, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00035393, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.6318393189050584, + "language_loss": 0.7398684, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75881988, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.520840883255005 + }, + { + "auxiliary_loss_clip": 0.01148324, + "auxiliary_loss_mlp": 0.01101755, + "balance_loss_clip": 1.00163364, + "balance_loss_mlp": 1.00037909, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 2.3211559869351706, + "language_loss": 0.82057476, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84307551, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01135023, + "auxiliary_loss_mlp": 0.00747344, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00042391, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 1.9819321073210077, + "language_loss": 0.6694206, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.68824422, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 4.084212064743042 + }, + { + "auxiliary_loss_clip": 0.01112852, + "auxiliary_loss_mlp": 0.01103868, + "balance_loss_clip": 1.00196993, + "balance_loss_mlp": 1.00048959, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.626634752505676, + "language_loss": 0.77125341, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79342061, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.727609157562256 + }, + { + "auxiliary_loss_clip": 0.01116537, + "auxiliary_loss_mlp": 0.00747229, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00031018, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 2.326832059868544, + "language_loss": 0.68126369, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.69990134, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.639158248901367 + }, + { + "auxiliary_loss_clip": 0.01117201, + "auxiliary_loss_mlp": 0.01103796, + "balance_loss_clip": 1.00167799, + "balance_loss_mlp": 1.00041795, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.0643759146296343, + "language_loss": 0.55516917, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.57737911, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.5940229892730713 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01103136, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00042522, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.6617303769643916, + "language_loss": 0.79159749, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81381679, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 2.5620224475860596 + }, + { + "auxiliary_loss_clip": 0.01150305, + "auxiliary_loss_mlp": 0.01103503, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00041032, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.3750089174620808, + "language_loss": 0.76150119, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78403932, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.595088005065918 + }, + { + "auxiliary_loss_clip": 0.0115039, + "auxiliary_loss_mlp": 0.01102712, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00057316, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 2.050469599739384, + "language_loss": 0.755346, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77787697, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 2.6240434646606445 + }, + { + "auxiliary_loss_clip": 0.01119039, + "auxiliary_loss_mlp": 0.01101563, + "balance_loss_clip": 1.00178993, + "balance_loss_mlp": 1.00037837, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.8918032140319259, + "language_loss": 0.70696187, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.72916788, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 2.658632516860962 + }, + { + "auxiliary_loss_clip": 0.01133659, + "auxiliary_loss_mlp": 0.01102478, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00053072, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.5719213496225053, + "language_loss": 0.71393776, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.7362991, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.5858383178710938 + }, + { + "auxiliary_loss_clip": 0.01116178, + "auxiliary_loss_mlp": 0.01102404, + "balance_loss_clip": 1.00170052, + "balance_loss_mlp": 1.00045586, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.3595249866878218, + "language_loss": 0.7419287, + "learning_rate": 2.058291183208771e-09, + "loss": 0.7641145, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.649315595626831 + }, + { + "auxiliary_loss_clip": 0.01164901, + "auxiliary_loss_mlp": 0.01102825, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.00049591, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.335304653570754, + "language_loss": 0.575122, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.5977993, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.488759756088257 + }, + { + "auxiliary_loss_clip": 0.01131788, + "auxiliary_loss_mlp": 0.01103973, + "balance_loss_clip": 1.0014776, + "balance_loss_mlp": 1.00040388, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 1.7944266244863272, + "language_loss": 0.80528045, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82763803, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.550185203552246 + }, + { + "auxiliary_loss_clip": 0.01148113, + "auxiliary_loss_mlp": 0.01103114, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00040293, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.738952137900985, + "language_loss": 0.78211409, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80462635, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.5028576850891113 + }, + { + "auxiliary_loss_clip": 0.01148145, + "auxiliary_loss_mlp": 0.01103439, + "balance_loss_clip": 1.00180042, + "balance_loss_mlp": 1.00053763, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.707283339040145, + "language_loss": 0.69927025, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72178614, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.5833916664123535 + }, + { + "auxiliary_loss_clip": 0.01150155, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00045657, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 1.6523520302961943, + "language_loss": 0.74484867, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76737237, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.5831613540649414 + }, + { + "auxiliary_loss_clip": 0.01150452, + "auxiliary_loss_mlp": 0.00747428, + "balance_loss_clip": 1.00190508, + "balance_loss_mlp": 1.00041401, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 2.2901796118859643, + "language_loss": 0.70275867, + "learning_rate": 1.953666699415768e-09, + "loss": 0.7217375, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.659687042236328 + }, + { + "auxiliary_loss_clip": 0.01132851, + "auxiliary_loss_mlp": 0.01102403, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00064576, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.754911222464412, + "language_loss": 0.69816196, + "learning_rate": 1.93649446302846e-09, + "loss": 0.72051442, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.6023075580596924 + }, + { + "auxiliary_loss_clip": 0.01087781, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_clip": 1.00172985, + "balance_loss_mlp": 1.00047052, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.162634195881391, + "language_loss": 0.74924713, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.77115101, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 2.6293084621429443 + }, + { + "auxiliary_loss_clip": 0.01131373, + "auxiliary_loss_mlp": 0.01101879, + "balance_loss_clip": 1.00163436, + "balance_loss_mlp": 1.00040817, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 1.8355642071110772, + "language_loss": 0.77434361, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79667616, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 2.5559167861938477 + }, + { + "auxiliary_loss_clip": 0.01148742, + "auxiliary_loss_mlp": 0.01103367, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00037026, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 2.4470222768562953, + "language_loss": 0.67993975, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70246077, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 2.5288753509521484 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.0107936, + "balance_loss_clip": 1.0011431, + "balance_loss_mlp": 1.00001419, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.8033829757038977, + "language_loss": 0.61005509, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63213992, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 4.53123927116394 + }, + { + "auxiliary_loss_clip": 0.01148413, + "auxiliary_loss_mlp": 0.01103415, + "balance_loss_clip": 1.00187576, + "balance_loss_mlp": 1.00041783, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.429007728413292, + "language_loss": 0.66276252, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.6852808, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.6454696655273438 + }, + { + "auxiliary_loss_clip": 0.01160432, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_clip": 1.00119662, + "balance_loss_mlp": 1.00002575, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7273885247321893, + "language_loss": 0.56229192, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58468992, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.147202253341675 + }, + { + "auxiliary_loss_clip": 0.0111637, + "auxiliary_loss_mlp": 0.01103003, + "balance_loss_clip": 1.00165558, + "balance_loss_mlp": 1.00057876, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 2.943117266275388, + "language_loss": 0.73128772, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75348151, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.667959451675415 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01102576, + "balance_loss_clip": 1.00155735, + "balance_loss_mlp": 1.00043714, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.7262785192744443, + "language_loss": 0.71296895, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73497713, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 2.6944363117218018 + }, + { + "auxiliary_loss_clip": 0.01150479, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.00052857, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.5508801458590429, + "language_loss": 0.70184195, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72436959, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 2.511800765991211 + }, + { + "auxiliary_loss_clip": 0.01118833, + "auxiliary_loss_mlp": 0.01101672, + "balance_loss_clip": 1.00178301, + "balance_loss_mlp": 1.00048745, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.3599730270391743, + "language_loss": 0.75507444, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.7772795, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 2.6398651599884033 + }, + { + "auxiliary_loss_clip": 0.01131401, + "auxiliary_loss_mlp": 0.01102772, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00044274, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.2302785936020584, + "language_loss": 0.70614564, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72848737, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 2.5535335540771484 + }, + { + "auxiliary_loss_clip": 0.01133603, + "auxiliary_loss_mlp": 0.01103667, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00038433, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.5434957848047703, + "language_loss": 0.70627248, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72864515, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.5914177894592285 + }, + { + "auxiliary_loss_clip": 0.011604, + "auxiliary_loss_mlp": 0.01079406, + "balance_loss_clip": 1.00117826, + "balance_loss_mlp": 1.00006008, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6536911820389771, + "language_loss": 0.53746581, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55986381, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.2058844566345215 + }, + { + "auxiliary_loss_clip": 0.01133406, + "auxiliary_loss_mlp": 0.01103775, + "balance_loss_clip": 1.00168395, + "balance_loss_mlp": 1.00058794, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.6284654792616688, + "language_loss": 0.78093028, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80330217, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.596924304962158 + }, + { + "auxiliary_loss_clip": 0.01116275, + "auxiliary_loss_mlp": 0.01102913, + "balance_loss_clip": 1.00204837, + "balance_loss_mlp": 1.00048876, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 2.3751635250225047, + "language_loss": 0.70811218, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.730304, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 2.624598503112793 + }, + { + "auxiliary_loss_clip": 0.01148606, + "auxiliary_loss_mlp": 0.01103532, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00043964, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.0380194355134438, + "language_loss": 0.82357597, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84609735, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 3.9591422080993652 + }, + { + "auxiliary_loss_clip": 0.01116836, + "auxiliary_loss_mlp": 0.01102421, + "balance_loss_clip": 1.00168204, + "balance_loss_mlp": 1.00047362, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.7224027732997234, + "language_loss": 0.85828012, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88047266, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.676114082336426 + }, + { + "auxiliary_loss_clip": 0.01150233, + "auxiliary_loss_mlp": 0.01102449, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00040603, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 1.9153048229054193, + "language_loss": 0.70061779, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72314459, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 3.898691177368164 + }, + { + "auxiliary_loss_clip": 0.01149904, + "auxiliary_loss_mlp": 0.00747324, + "balance_loss_clip": 1.00183833, + "balance_loss_mlp": 1.0003407, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 1.808485256237832, + "language_loss": 0.80328888, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.8222611, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 2.5757322311401367 + }, + { + "auxiliary_loss_clip": 0.0110232, + "auxiliary_loss_mlp": 0.01102024, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00045729, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 2.421933465301147, + "language_loss": 0.7999683, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82201171, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 2.6776881217956543 + }, + { + "auxiliary_loss_clip": 0.01148146, + "auxiliary_loss_mlp": 0.01102609, + "balance_loss_clip": 1.00181973, + "balance_loss_mlp": 1.00056577, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.7750374490892693, + "language_loss": 0.84537685, + "learning_rate": 1.593380599750338e-09, + "loss": 0.8678844, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 3.9095687866210938 + }, + { + "auxiliary_loss_clip": 0.01164972, + "auxiliary_loss_mlp": 0.01102542, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00049853, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.9457193550948966, + "language_loss": 0.70124751, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72392261, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.488326072692871 + }, + { + "auxiliary_loss_clip": 0.01116761, + "auxiliary_loss_mlp": 0.01102405, + "balance_loss_clip": 1.00168598, + "balance_loss_mlp": 1.00064766, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 1.9036324921431922, + "language_loss": 0.79986691, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82205856, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 2.605438709259033 + }, + { + "auxiliary_loss_clip": 0.011649, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_clip": 1.00181687, + "balance_loss_mlp": 1.00055921, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.6334575926765675, + "language_loss": 0.61926138, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64194214, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.662330389022827 + }, + { + "auxiliary_loss_clip": 0.0116496, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.00055599, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.2192378172147167, + "language_loss": 0.72766763, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75034797, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.58351993560791 + }, + { + "auxiliary_loss_clip": 0.01165, + "auxiliary_loss_mlp": 0.01102978, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00074399, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.0474397846153187, + "language_loss": 0.80931246, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.83199227, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 2.471768379211426 + }, + { + "auxiliary_loss_clip": 0.01147654, + "auxiliary_loss_mlp": 0.01101568, + "balance_loss_clip": 1.00170648, + "balance_loss_mlp": 1.00038278, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.8658872889596834, + "language_loss": 0.80593133, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.8284235, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.5457098484039307 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00043321, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 1.9881288232856944, + "language_loss": 0.6497516, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67242658, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.538233518600464 + }, + { + "auxiliary_loss_clip": 0.01149817, + "auxiliary_loss_mlp": 0.01103328, + "balance_loss_clip": 1.0017519, + "balance_loss_mlp": 1.0005219, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 2.72025568439882, + "language_loss": 0.69342637, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71595776, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 2.6377625465393066 + }, + { + "auxiliary_loss_clip": 0.01100191, + "auxiliary_loss_mlp": 0.01102703, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00051713, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 2.361079532519334, + "language_loss": 0.75602484, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77805376, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 2.660080671310425 + }, + { + "auxiliary_loss_clip": 0.01133648, + "auxiliary_loss_mlp": 0.01102856, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00043106, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.337292203389204, + "language_loss": 0.74082839, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76319343, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.5715978145599365 + }, + { + "auxiliary_loss_clip": 0.01114472, + "auxiliary_loss_mlp": 0.01101518, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 1.00052428, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.4869604711195663, + "language_loss": 0.60053003, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62268996, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 2.6728007793426514 + }, + { + "auxiliary_loss_clip": 0.01135546, + "auxiliary_loss_mlp": 0.01102606, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.0004673, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 4.141130707976114, + "language_loss": 0.71859217, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74097371, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.5917296409606934 + }, + { + "auxiliary_loss_clip": 0.01148131, + "auxiliary_loss_mlp": 0.01102566, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00042772, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.5513476261569477, + "language_loss": 0.5976122, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62011915, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.6101930141448975 + }, + { + "auxiliary_loss_clip": 0.01164939, + "auxiliary_loss_mlp": 0.01102951, + "balance_loss_clip": 1.00183022, + "balance_loss_mlp": 1.00043082, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.88792409836426, + "language_loss": 0.76452267, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78720152, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.4542551040649414 + }, + { + "auxiliary_loss_clip": 0.01133746, + "auxiliary_loss_mlp": 0.01103548, + "balance_loss_clip": 1.0017544, + "balance_loss_mlp": 1.00055158, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 2.2542583223445423, + "language_loss": 0.67745095, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.69982386, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.823296070098877 + }, + { + "auxiliary_loss_clip": 0.01147618, + "auxiliary_loss_mlp": 0.01101507, + "balance_loss_clip": 1.00173712, + "balance_loss_mlp": 1.00032187, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.1506065723365553, + "language_loss": 0.73986018, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76235151, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.481839418411255 + }, + { + "auxiliary_loss_clip": 0.01132965, + "auxiliary_loss_mlp": 0.01102867, + "balance_loss_clip": 1.00172997, + "balance_loss_mlp": 1.0004425, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 4.088420337341782, + "language_loss": 0.73555005, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75790834, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.5860137939453125 + }, + { + "auxiliary_loss_clip": 0.0110055, + "auxiliary_loss_mlp": 0.01102873, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00054383, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 1.7508723328491165, + "language_loss": 0.69251657, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71455073, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.699779987335205 + }, + { + "auxiliary_loss_clip": 0.01150276, + "auxiliary_loss_mlp": 0.01103706, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00032783, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 2.7776020816284, + "language_loss": 0.60803658, + "learning_rate": 1.311740377491155e-09, + "loss": 0.63057643, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.508326292037964 + }, + { + "auxiliary_loss_clip": 0.01130617, + "auxiliary_loss_mlp": 0.01102723, + "balance_loss_clip": 1.00187838, + "balance_loss_mlp": 1.00058448, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 1.98876850284323, + "language_loss": 0.70430553, + "learning_rate": 1.297675079582783e-09, + "loss": 0.72663891, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 2.535947322845459 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.00747401, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00037384, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 2.6495261245387, + "language_loss": 0.83829612, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.8574208, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 3.9673500061035156 + }, + { + "auxiliary_loss_clip": 0.01148004, + "auxiliary_loss_mlp": 0.01102075, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00041294, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.7331474091011252, + "language_loss": 0.69824505, + "learning_rate": 1.26977185727406e-09, + "loss": 0.7207458, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 2.5146608352661133 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01103108, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00049233, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 1.9401156718435295, + "language_loss": 0.74023801, + "learning_rate": 1.25593393393153e-09, + "loss": 0.76275253, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.648606061935425 + }, + { + "auxiliary_loss_clip": 0.0116496, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_clip": 1.00174415, + "balance_loss_mlp": 1.00044763, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 1.9312704607031843, + "language_loss": 0.79782879, + "learning_rate": 1.242171803164549e-09, + "loss": 0.82051188, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.480404853820801 + }, + { + "auxiliary_loss_clip": 0.01117963, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_clip": 1.0017519, + "balance_loss_mlp": 1.00038099, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 1.9157949165692196, + "language_loss": 0.69941187, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72162139, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 2.637406587600708 + }, + { + "auxiliary_loss_clip": 0.01164875, + "auxiliary_loss_mlp": 0.01101549, + "balance_loss_clip": 1.00195038, + "balance_loss_mlp": 1.00045967, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.6763019006510211, + "language_loss": 0.73934865, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76201284, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 2.479875326156616 + }, + { + "auxiliary_loss_clip": 0.01100452, + "auxiliary_loss_mlp": 0.01101602, + "balance_loss_clip": 1.00149763, + "balance_loss_mlp": 1.0005126, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 5.960095171937813, + "language_loss": 0.69404912, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.7160697, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 2.7508368492126465 + }, + { + "auxiliary_loss_clip": 0.01133316, + "auxiliary_loss_mlp": 0.01102243, + "balance_loss_clip": 1.0018487, + "balance_loss_mlp": 1.00048614, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 1.85808988661251, + "language_loss": 0.75971454, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.78207016, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 2.566025733947754 + }, + { + "auxiliary_loss_clip": 0.01132149, + "auxiliary_loss_mlp": 0.01101314, + "balance_loss_clip": 1.00185645, + "balance_loss_mlp": 1.00041533, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.8518617483665303, + "language_loss": 0.65522838, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67756301, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 2.613879919052124 + }, + { + "auxiliary_loss_clip": 0.01148325, + "auxiliary_loss_mlp": 0.01103129, + "balance_loss_clip": 1.00183558, + "balance_loss_mlp": 1.00051379, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.8016968590453342, + "language_loss": 0.73780555, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76032007, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.5279359817504883 + }, + { + "auxiliary_loss_clip": 0.01165055, + "auxiliary_loss_mlp": 0.01102633, + "balance_loss_clip": 1.00191176, + "balance_loss_mlp": 1.00039935, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 1.9072098169906782, + "language_loss": 0.69252658, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.71520352, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.561074733734131 + }, + { + "auxiliary_loss_clip": 0.01147941, + "auxiliary_loss_mlp": 0.01102166, + "balance_loss_clip": 1.00175691, + "balance_loss_mlp": 1.00040901, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.8279445613399852, + "language_loss": 0.79120207, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81370312, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 3.9793715476989746 + }, + { + "auxiliary_loss_clip": 0.01131618, + "auxiliary_loss_mlp": 0.01103164, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00054824, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 1.689506820563453, + "language_loss": 0.70562291, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.72797072, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 2.602017641067505 + }, + { + "auxiliary_loss_clip": 0.01132568, + "auxiliary_loss_mlp": 0.01103319, + "balance_loss_clip": 1.00167751, + "balance_loss_mlp": 1.00041771, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.6251336591888779, + "language_loss": 0.8726939, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89505273, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 4.016450643539429 + }, + { + "auxiliary_loss_clip": 0.01147738, + "auxiliary_loss_mlp": 0.01103046, + "balance_loss_clip": 1.00176156, + "balance_loss_mlp": 1.00043106, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.6433058001853946, + "language_loss": 0.6291523, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65166014, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 2.554093837738037 + }, + { + "auxiliary_loss_clip": 0.01150111, + "auxiliary_loss_mlp": 0.01103021, + "balance_loss_clip": 1.0018326, + "balance_loss_mlp": 1.00050116, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.708705143357289, + "language_loss": 0.7276535, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75018477, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 2.5077359676361084 + }, + { + "auxiliary_loss_clip": 0.01149383, + "auxiliary_loss_mlp": 0.01102884, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00036407, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 2.015923875199277, + "language_loss": 0.69933307, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72185576, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 2.5906245708465576 + }, + { + "auxiliary_loss_clip": 0.01117988, + "auxiliary_loss_mlp": 0.01102719, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00039005, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.0172244846931418, + "language_loss": 0.73492312, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75713015, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 4.01584792137146 + }, + { + "auxiliary_loss_clip": 0.01164858, + "auxiliary_loss_mlp": 0.01101552, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00046301, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.7053446561214174, + "language_loss": 0.8646431, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88730717, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 2.6183383464813232 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01103962, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00039315, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.5754135290277276, + "language_loss": 0.71428657, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73648494, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.653486728668213 + }, + { + "auxiliary_loss_clip": 0.01135345, + "auxiliary_loss_mlp": 0.01102028, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00046134, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.269301046440758, + "language_loss": 0.64990902, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67228276, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 2.6908671855926514 + }, + { + "auxiliary_loss_clip": 0.01116389, + "auxiliary_loss_mlp": 0.01103783, + "balance_loss_clip": 1.00158715, + "balance_loss_mlp": 1.00050032, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 3.45857092168492, + "language_loss": 0.61680621, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.63900799, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 2.6787033081054688 + }, + { + "auxiliary_loss_clip": 0.01116292, + "auxiliary_loss_mlp": 0.01103274, + "balance_loss_clip": 1.0017122, + "balance_loss_mlp": 1.00037265, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.6617241680346724, + "language_loss": 0.70420849, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72640419, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.5785157680511475 + }, + { + "auxiliary_loss_clip": 0.01143709, + "auxiliary_loss_mlp": 0.0107972, + "balance_loss_clip": 1.00115561, + "balance_loss_mlp": 0.99999285, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6691357031125805, + "language_loss": 0.55537063, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57760495, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.31077241897583 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.0110298, + "balance_loss_clip": 1.00170541, + "balance_loss_mlp": 1.00055552, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 2.3691072882165543, + "language_loss": 0.83723581, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85958362, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 2.560926914215088 + }, + { + "auxiliary_loss_clip": 0.01147855, + "auxiliary_loss_mlp": 0.01103058, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00044274, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.7833572774033877, + "language_loss": 0.85267496, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87518412, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.5537452697753906 + }, + { + "auxiliary_loss_clip": 0.01164925, + "auxiliary_loss_mlp": 0.01101788, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.0004127, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.717466137911386, + "language_loss": 0.84808576, + "learning_rate": 9.465627102240859e-10, + "loss": 0.87075287, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.597087860107422 + }, + { + "auxiliary_loss_clip": 0.01131236, + "auxiliary_loss_mlp": 0.01101841, + "balance_loss_clip": 1.00154972, + "balance_loss_mlp": 1.00046575, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.7441266732139826, + "language_loss": 0.76685554, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78918636, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 2.566180944442749 + }, + { + "auxiliary_loss_clip": 0.01118491, + "auxiliary_loss_mlp": 0.01102772, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.0003469, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 2.698700332300817, + "language_loss": 0.75740147, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77961409, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.6158909797668457 + }, + { + "auxiliary_loss_clip": 0.01148559, + "auxiliary_loss_mlp": 0.00747389, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00043428, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 3.3226171651173977, + "language_loss": 0.67525995, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69421941, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.5196993350982666 + }, + { + "auxiliary_loss_clip": 0.0113325, + "auxiliary_loss_mlp": 0.01102883, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00055337, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 1.814663157799185, + "language_loss": 0.71942735, + "learning_rate": 8.992457045289282e-10, + "loss": 0.74178863, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 2.5715155601501465 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01103939, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00065613, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 1.9754722152446311, + "language_loss": 0.80990016, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83258986, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.4552605152130127 + }, + { + "auxiliary_loss_clip": 0.01150404, + "auxiliary_loss_mlp": 0.0110307, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00045431, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.5809838268351324, + "language_loss": 0.66077375, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68330854, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.58566951751709 + }, + { + "auxiliary_loss_clip": 0.01148047, + "auxiliary_loss_mlp": 0.01102846, + "balance_loss_clip": 1.00168586, + "balance_loss_mlp": 1.00051713, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.8691948432883885, + "language_loss": 0.72080696, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74331582, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.6405715942382812 + }, + { + "auxiliary_loss_clip": 0.01150125, + "auxiliary_loss_mlp": 0.01102109, + "balance_loss_clip": 1.00188434, + "balance_loss_mlp": 1.00035179, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.797114734778901, + "language_loss": 0.77684838, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79937071, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 2.5247721672058105 + }, + { + "auxiliary_loss_clip": 0.01149886, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_clip": 1.00186944, + "balance_loss_mlp": 1.00041437, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.782920887125169, + "language_loss": 0.75516391, + "learning_rate": 8.418050878944427e-10, + "loss": 0.7776835, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.514997959136963 + }, + { + "auxiliary_loss_clip": 0.01146013, + "auxiliary_loss_mlp": 0.01079361, + "balance_loss_clip": 1.0011493, + "balance_loss_mlp": 1.00001574, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6747047817736729, + "language_loss": 0.5363372, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55859095, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.2389047145843506 + }, + { + "auxiliary_loss_clip": 0.01164777, + "auxiliary_loss_mlp": 0.01102015, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00054371, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.7839410788628227, + "language_loss": 0.8231703, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84583831, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.4921844005584717 + }, + { + "auxiliary_loss_clip": 0.01118788, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_clip": 1.00171065, + "balance_loss_mlp": 1.00067389, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.574566149923884, + "language_loss": 0.8134774, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83568573, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 4.044561862945557 + }, + { + "auxiliary_loss_clip": 0.01149426, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00053596, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.4158140737651836, + "language_loss": 0.65818274, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68070757, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.7435035705566406 + }, + { + "auxiliary_loss_clip": 0.01148211, + "auxiliary_loss_mlp": 0.00747427, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00043583, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.509988686046816, + "language_loss": 0.76640177, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78535813, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 2.567776679992676 + }, + { + "auxiliary_loss_clip": 0.01116974, + "auxiliary_loss_mlp": 0.01104125, + "balance_loss_clip": 1.00172997, + "balance_loss_mlp": 1.00036561, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 3.4275649496377754, + "language_loss": 0.68716621, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70937723, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.6357460021972656 + }, + { + "auxiliary_loss_clip": 0.01108934, + "auxiliary_loss_mlp": 0.01079384, + "balance_loss_clip": 1.00120473, + "balance_loss_mlp": 1.00003839, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6104904452156475, + "language_loss": 0.52586246, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54774559, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 3.2923622131347656 + }, + { + "auxiliary_loss_clip": 0.01120929, + "auxiliary_loss_mlp": 0.0110415, + "balance_loss_clip": 1.00173581, + "balance_loss_mlp": 1.00048602, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.616094887193325, + "language_loss": 0.75841522, + "learning_rate": 7.538421534734052e-10, + "loss": 0.78066599, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 2.630018949508667 + }, + { + "auxiliary_loss_clip": 0.01100462, + "auxiliary_loss_mlp": 0.01104021, + "balance_loss_clip": 1.00168824, + "balance_loss_mlp": 1.00045204, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.229381611846006, + "language_loss": 0.70476782, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72681266, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 2.6084463596343994 + }, + { + "auxiliary_loss_clip": 0.01118623, + "auxiliary_loss_mlp": 0.01102668, + "balance_loss_clip": 1.00167441, + "balance_loss_mlp": 1.0004344, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 7.056423862906479, + "language_loss": 0.68610591, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70831883, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.6254279613494873 + }, + { + "auxiliary_loss_clip": 0.01131851, + "auxiliary_loss_mlp": 0.01103758, + "balance_loss_clip": 1.00175238, + "balance_loss_mlp": 1.00057018, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.7104439687076165, + "language_loss": 0.71371353, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73606962, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 2.643380880355835 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01102733, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00040364, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 1.8590041571317768, + "language_loss": 0.68183726, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70434809, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 2.505723237991333 + }, + { + "auxiliary_loss_clip": 0.01129844, + "auxiliary_loss_mlp": 0.01079418, + "balance_loss_clip": 1.0011481, + "balance_loss_mlp": 1.00007236, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7175183354348525, + "language_loss": 0.53484803, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55694067, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.279142141342163 + }, + { + "auxiliary_loss_clip": 0.01133698, + "auxiliary_loss_mlp": 0.00747426, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00035882, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.5860539062751424, + "language_loss": 0.71425825, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73306942, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 3.9832241535186768 + }, + { + "auxiliary_loss_clip": 0.01116906, + "auxiliary_loss_mlp": 0.01103653, + "balance_loss_clip": 1.00169396, + "balance_loss_mlp": 1.00046515, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 2.593723109871848, + "language_loss": 0.82080984, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84301543, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.6014511585235596 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01102459, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00051105, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.6750572826021373, + "language_loss": 0.68332314, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70552051, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 4.046942472457886 + }, + { + "auxiliary_loss_clip": 0.01165102, + "auxiliary_loss_mlp": 0.0110326, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.0003581, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 1.8835578084087992, + "language_loss": 0.82475501, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84743857, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 2.524099349975586 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.0110255, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00041151, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.674306060513571, + "language_loss": 0.81923389, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84156966, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 2.6048758029937744 + }, + { + "auxiliary_loss_clip": 0.01150308, + "auxiliary_loss_mlp": 0.01102388, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00053585, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 2.190969789817493, + "language_loss": 0.77445596, + "learning_rate": 6.408154723420711e-10, + "loss": 0.79698288, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 2.557748317718506 + }, + { + "auxiliary_loss_clip": 0.01131918, + "auxiliary_loss_mlp": 0.0110318, + "balance_loss_clip": 1.00168431, + "balance_loss_mlp": 1.00042117, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 2.0984669628592667, + "language_loss": 0.71868682, + "learning_rate": 6.309952072811597e-10, + "loss": 0.74103773, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 3.9611361026763916 + }, + { + "auxiliary_loss_clip": 0.01145801, + "auxiliary_loss_mlp": 0.01079318, + "balance_loss_clip": 1.00119591, + "balance_loss_mlp": 0.99997282, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6293132853082128, + "language_loss": 0.55118048, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57343173, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 3.218848466873169 + }, + { + "auxiliary_loss_clip": 0.01117484, + "auxiliary_loss_mlp": 0.01102189, + "balance_loss_clip": 1.00163531, + "balance_loss_mlp": 1.00043201, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 2.256944408799171, + "language_loss": 0.69596291, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71815968, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.6320743560791016 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01104389, + "balance_loss_clip": 1.00172174, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 1.7501257773814733, + "language_loss": 0.65342969, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67566705, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 2.6229135990142822 + }, + { + "auxiliary_loss_clip": 0.01083542, + "auxiliary_loss_mlp": 0.011024, + "balance_loss_clip": 1.001508, + "balance_loss_mlp": 1.00045204, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 2.650623918846368, + "language_loss": 0.62833929, + "learning_rate": 5.924723134487219e-10, + "loss": 0.6501987, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 2.6899755001068115 + }, + { + "auxiliary_loss_clip": 0.01164996, + "auxiliary_loss_mlp": 0.01103277, + "balance_loss_clip": 1.00186992, + "balance_loss_mlp": 1.00047076, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.0713215310361135, + "language_loss": 0.72524107, + "learning_rate": 5.830311334193983e-10, + "loss": 0.74792379, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.484384775161743 + }, + { + "auxiliary_loss_clip": 0.01164854, + "auxiliary_loss_mlp": 0.01102875, + "balance_loss_clip": 1.0018059, + "balance_loss_mlp": 1.00035548, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.5374175392354419, + "language_loss": 0.70367908, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72635639, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.5314905643463135 + }, + { + "auxiliary_loss_clip": 0.01148191, + "auxiliary_loss_mlp": 0.01103625, + "balance_loss_clip": 1.0017091, + "balance_loss_mlp": 1.00053263, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 2.3125307739526826, + "language_loss": 0.68259525, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70511341, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 2.8909831047058105 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.0110331, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00040817, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.022702960053188, + "language_loss": 0.80826223, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83046424, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 2.6200361251831055 + }, + { + "auxiliary_loss_clip": 0.01117243, + "auxiliary_loss_mlp": 0.01101986, + "balance_loss_clip": 1.0016284, + "balance_loss_mlp": 1.00041962, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.7817493140357958, + "language_loss": 0.91590041, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93809271, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.6446774005889893 + }, + { + "auxiliary_loss_clip": 0.01112689, + "auxiliary_loss_mlp": 0.01079323, + "balance_loss_clip": 1.00112426, + "balance_loss_mlp": 0.99997741, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.6947494098308709, + "language_loss": 0.55184704, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57376713, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.3245513439178467 + }, + { + "auxiliary_loss_clip": 0.01133381, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.0004493, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.3459345367221245, + "language_loss": 0.65005499, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67241752, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.905944347381592 + }, + { + "auxiliary_loss_clip": 0.01120997, + "auxiliary_loss_mlp": 0.01103105, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.00039399, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 14.307589652507355, + "language_loss": 0.73045671, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75269777, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 2.6018035411834717 + }, + { + "auxiliary_loss_clip": 0.01119095, + "auxiliary_loss_mlp": 0.01103051, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00043571, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 2.2120175378865365, + "language_loss": 0.77041924, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79264069, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.6359994411468506 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01101116, + "balance_loss_clip": 1.00161433, + "balance_loss_mlp": 1.00040841, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.3783543159953697, + "language_loss": 0.77928966, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80148447, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.6281988620758057 + }, + { + "auxiliary_loss_clip": 0.01148902, + "auxiliary_loss_mlp": 0.01104446, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.0004952, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.296496445038925, + "language_loss": 0.67422712, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69676059, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 2.502751350402832 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01078982, + "balance_loss_clip": 1.00145578, + "balance_loss_mlp": 1.00001812, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7304922399269802, + "language_loss": 0.53458172, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55667031, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 3.0392093658447266 + }, + { + "auxiliary_loss_clip": 0.01119164, + "auxiliary_loss_mlp": 0.01103023, + "balance_loss_clip": 1.00182354, + "balance_loss_mlp": 1.00040746, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.6334649659589568, + "language_loss": 0.60064793, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62286985, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.5883705615997314 + }, + { + "auxiliary_loss_clip": 0.01131075, + "auxiliary_loss_mlp": 0.01102632, + "balance_loss_clip": 1.00199878, + "balance_loss_mlp": 1.0003978, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.5686419666343745, + "language_loss": 0.61974555, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64208257, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.7170379161834717 + }, + { + "auxiliary_loss_clip": 0.01133107, + "auxiliary_loss_mlp": 0.01104116, + "balance_loss_clip": 1.00189388, + "balance_loss_mlp": 1.00054705, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.5226083988566574, + "language_loss": 0.74224746, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76461965, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.5997416973114014 + }, + { + "auxiliary_loss_clip": 0.01116803, + "auxiliary_loss_mlp": 0.01101933, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00046194, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.5002068408946578, + "language_loss": 0.73460865, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.756796, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.645317316055298 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.00747375, + "balance_loss_clip": 1.0017302, + "balance_loss_mlp": 1.00043643, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.796003605166982, + "language_loss": 0.7139523, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73275512, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 4.078955411911011 + }, + { + "auxiliary_loss_clip": 0.01114867, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_clip": 1.00165653, + "balance_loss_mlp": 1.00049675, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.9580947183612551, + "language_loss": 0.79804826, + "learning_rate": 4.341315219624775e-10, + "loss": 0.82022035, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 2.6058855056762695 + }, + { + "auxiliary_loss_clip": 0.01112571, + "auxiliary_loss_mlp": 0.01102426, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00038266, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 1.6830339360371516, + "language_loss": 0.74479324, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.76694322, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.6215264797210693 + }, + { + "auxiliary_loss_clip": 0.01164715, + "auxiliary_loss_mlp": 0.00747363, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00037408, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 1.4607657844142667, + "language_loss": 0.72689366, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74601448, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 2.5797441005706787 + }, + { + "auxiliary_loss_clip": 0.01132534, + "auxiliary_loss_mlp": 0.01102662, + "balance_loss_clip": 1.00164568, + "balance_loss_mlp": 1.00052333, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 1.9809640872743934, + "language_loss": 0.75931138, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.7816633, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.5698957443237305 + }, + { + "auxiliary_loss_clip": 0.01135122, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00035536, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 3.11795159503194, + "language_loss": 0.6771487, + "learning_rate": 4.022808578922898e-10, + "loss": 0.69952583, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 2.6062731742858887 + }, + { + "auxiliary_loss_clip": 0.01148544, + "auxiliary_loss_mlp": 0.01104032, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.00055814, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 1.9024081955158434, + "language_loss": 0.65703845, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67956418, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 2.5617687702178955 + }, + { + "auxiliary_loss_clip": 0.01150522, + "auxiliary_loss_mlp": 0.01103314, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00041282, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.1853146706273683, + "language_loss": 0.70666057, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.72919893, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.5324249267578125 + }, + { + "auxiliary_loss_clip": 0.01150576, + "auxiliary_loss_mlp": 0.01103029, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00041389, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.480040788399966, + "language_loss": 0.74176627, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76430231, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 2.62251353263855 + }, + { + "auxiliary_loss_clip": 0.01100246, + "auxiliary_loss_mlp": 0.01102026, + "balance_loss_clip": 1.00161028, + "balance_loss_mlp": 1.0005554, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.5203841731397625, + "language_loss": 0.70615852, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72818124, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.01148354, + "auxiliary_loss_mlp": 0.01103997, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00033283, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 2.2047286724057122, + "language_loss": 0.84496129, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86748481, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 3.911735773086548 + }, + { + "auxiliary_loss_clip": 0.01116573, + "auxiliary_loss_mlp": 0.01101723, + "balance_loss_clip": 1.00167394, + "balance_loss_mlp": 1.00034761, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.3444727605305693, + "language_loss": 0.65960246, + "learning_rate": 3.567796158934211e-10, + "loss": 0.6817854, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.644120454788208 + }, + { + "auxiliary_loss_clip": 0.01114735, + "auxiliary_loss_mlp": 0.01102936, + "balance_loss_clip": 1.00182235, + "balance_loss_mlp": 1.00041616, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.4526481173069428, + "language_loss": 0.64779627, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66997302, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 2.586890459060669 + }, + { + "auxiliary_loss_clip": 0.01114212, + "auxiliary_loss_mlp": 0.01103329, + "balance_loss_clip": 1.001984, + "balance_loss_mlp": 1.00042701, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.6885463047861526, + "language_loss": 0.7823028, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80447823, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 4.033175230026245 + }, + { + "auxiliary_loss_clip": 0.01149517, + "auxiliary_loss_mlp": 0.01103472, + "balance_loss_clip": 1.00181091, + "balance_loss_mlp": 1.00047517, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.7085037701666785, + "language_loss": 0.68808973, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71061957, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.5552027225494385 + }, + { + "auxiliary_loss_clip": 0.01115852, + "auxiliary_loss_mlp": 0.01102046, + "balance_loss_clip": 1.00159216, + "balance_loss_mlp": 1.00038457, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.9214752643982738, + "language_loss": 0.7549212, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.77710009, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.6106369495391846 + }, + { + "auxiliary_loss_clip": 0.01100484, + "auxiliary_loss_mlp": 0.01103203, + "balance_loss_clip": 1.00165319, + "balance_loss_mlp": 1.00049186, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 1.9509780970420099, + "language_loss": 0.70498228, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72701913, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 4.089872598648071 + }, + { + "auxiliary_loss_clip": 0.01148028, + "auxiliary_loss_mlp": 0.0110166, + "balance_loss_clip": 1.00177979, + "balance_loss_mlp": 1.00038028, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.8920410844183313, + "language_loss": 0.7512244, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77372128, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 2.53837251663208 + }, + { + "auxiliary_loss_clip": 0.01130304, + "auxiliary_loss_mlp": 0.01104032, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00055861, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.7803677039222352, + "language_loss": 0.76625323, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78859657, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 2.5778861045837402 + }, + { + "auxiliary_loss_clip": 0.01148235, + "auxiliary_loss_mlp": 0.01103332, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00043023, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 1.947105462660755, + "language_loss": 0.74234575, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.7648614, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 2.5377423763275146 + }, + { + "auxiliary_loss_clip": 0.01150674, + "auxiliary_loss_mlp": 0.01104076, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00041151, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.732216477339199, + "language_loss": 0.82174289, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84429049, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.505169153213501 + }, + { + "auxiliary_loss_clip": 0.01164846, + "auxiliary_loss_mlp": 0.01102866, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00044107, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.704085351583167, + "language_loss": 0.78446436, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80714154, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 2.4762520790100098 + }, + { + "auxiliary_loss_clip": 0.01120744, + "auxiliary_loss_mlp": 0.01102824, + "balance_loss_clip": 1.00172853, + "balance_loss_mlp": 1.00039935, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.6671252219994626, + "language_loss": 0.72102177, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.7432574, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 2.6670761108398438 + }, + { + "auxiliary_loss_clip": 0.01150252, + "auxiliary_loss_mlp": 0.01102737, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00059891, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 2.230547821538879, + "language_loss": 0.77580309, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79833305, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.5343034267425537 + }, + { + "auxiliary_loss_clip": 0.01132951, + "auxiliary_loss_mlp": 0.01102135, + "balance_loss_clip": 1.00165749, + "balance_loss_mlp": 1.00037801, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 1.93265974363176, + "language_loss": 0.69765508, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72000593, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 2.5846333503723145 + }, + { + "auxiliary_loss_clip": 0.01150322, + "auxiliary_loss_mlp": 0.01103005, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00038981, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.6575372188619493, + "language_loss": 0.75299549, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77552885, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 2.5051493644714355 + }, + { + "auxiliary_loss_clip": 0.01117515, + "auxiliary_loss_mlp": 0.01103557, + "balance_loss_clip": 1.00174546, + "balance_loss_mlp": 1.0004648, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.6623430227305225, + "language_loss": 0.74487066, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76708144, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.676740884780884 + }, + { + "auxiliary_loss_clip": 0.01102262, + "auxiliary_loss_mlp": 0.00747338, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.00043368, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 4.370243261296745, + "language_loss": 0.78101307, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.79950905, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 2.6630241870880127 + }, + { + "auxiliary_loss_clip": 0.01147923, + "auxiliary_loss_mlp": 0.01101215, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.00050652, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.4087577353622924, + "language_loss": 0.6652745, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68776584, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.5376861095428467 + }, + { + "auxiliary_loss_clip": 0.01164807, + "auxiliary_loss_mlp": 0.01102564, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00042558, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.3144379272932483, + "language_loss": 0.813187, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83586073, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.575418710708618 + }, + { + "auxiliary_loss_clip": 0.01143176, + "auxiliary_loss_mlp": 0.01078968, + "balance_loss_clip": 1.00103533, + "balance_loss_mlp": 1.00000334, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7186227617429927, + "language_loss": 0.57325989, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59548128, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.224931478500366 + }, + { + "auxiliary_loss_clip": 0.0114566, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_clip": 1.00188839, + "balance_loss_mlp": 1.00058293, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.5668520500324836, + "language_loss": 0.77283227, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79531801, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 2.538555383682251 + }, + { + "auxiliary_loss_clip": 0.01120314, + "auxiliary_loss_mlp": 0.01102673, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00043893, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 1.8764435114728084, + "language_loss": 0.85984921, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88207901, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.631014823913574 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01102137, + "balance_loss_clip": 1.00163007, + "balance_loss_mlp": 1.00047565, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.6157258469033764, + "language_loss": 0.72888297, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75090265, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 2.6328911781311035 + }, + { + "auxiliary_loss_clip": 0.01149755, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00050211, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.5899179457647261, + "language_loss": 0.76270998, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78522968, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 2.6185216903686523 + }, + { + "auxiliary_loss_clip": 0.01133534, + "auxiliary_loss_mlp": 0.01102336, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00034034, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 1.8369738393771138, + "language_loss": 0.63052946, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65288818, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.6394894123077393 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_clip": 1.0019089, + "balance_loss_mlp": 1.0004766, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 1.7533242697284346, + "language_loss": 0.74229276, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76482308, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.553102493286133 + }, + { + "auxiliary_loss_clip": 0.01087354, + "auxiliary_loss_mlp": 0.01102233, + "balance_loss_clip": 1.00166917, + "balance_loss_mlp": 1.00047648, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.853599044399274, + "language_loss": 0.78847331, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81036919, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 4.088626861572266 + }, + { + "auxiliary_loss_clip": 0.01164977, + "auxiliary_loss_mlp": 0.01102096, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.0004344, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.3380915292851063, + "language_loss": 0.65485799, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67752874, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.453824996948242 + }, + { + "auxiliary_loss_clip": 0.01132095, + "auxiliary_loss_mlp": 0.00747399, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00049067, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.6126082399097073, + "language_loss": 0.64230227, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66109717, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.64200758934021 + }, + { + "auxiliary_loss_clip": 0.01131451, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_clip": 1.00170171, + "balance_loss_mlp": 1.00046992, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.8555499564024975, + "language_loss": 0.64906925, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.67140794, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 2.6353800296783447 + }, + { + "auxiliary_loss_clip": 0.01131422, + "auxiliary_loss_mlp": 0.00747286, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00040245, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 2.0260630660872616, + "language_loss": 0.73809391, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.756881, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.566251516342163 + }, + { + "auxiliary_loss_clip": 0.01133336, + "auxiliary_loss_mlp": 0.01101626, + "balance_loss_clip": 1.00151062, + "balance_loss_mlp": 1.00034606, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.771099369650464, + "language_loss": 0.79253763, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81488729, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 2.5656628608703613 + }, + { + "auxiliary_loss_clip": 0.01100349, + "auxiliary_loss_mlp": 0.00747359, + "balance_loss_clip": 1.00167966, + "balance_loss_mlp": 1.00039899, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.5053522605829992, + "language_loss": 0.70888567, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72736275, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.6890692710876465 + }, + { + "auxiliary_loss_clip": 0.01150246, + "auxiliary_loss_mlp": 0.01103712, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00042939, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.090319795830276, + "language_loss": 0.78226495, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.8048045, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 2.5666518211364746 + }, + { + "auxiliary_loss_clip": 0.01148054, + "auxiliary_loss_mlp": 0.01101262, + "balance_loss_clip": 1.00178683, + "balance_loss_mlp": 1.00045919, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.6751317575225404, + "language_loss": 0.82219791, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.8446911, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.546858787536621 + }, + { + "auxiliary_loss_clip": 0.01117106, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00040984, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.6262784112971984, + "language_loss": 0.70172727, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72037196, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.638833522796631 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.0110224, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00048316, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.569263931937685, + "language_loss": 0.74935877, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77157009, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 2.6641016006469727 + }, + { + "auxiliary_loss_clip": 0.01133595, + "auxiliary_loss_mlp": 0.0110257, + "balance_loss_clip": 1.00173771, + "balance_loss_mlp": 1.0004313, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 2.36386953631479, + "language_loss": 0.79440844, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81677008, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 4.155328989028931 + }, + { + "auxiliary_loss_clip": 0.01130894, + "auxiliary_loss_mlp": 0.01103248, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.00034678, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 2.4982686521326034, + "language_loss": 0.70399892, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72634041, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.6439340114593506 + }, + { + "auxiliary_loss_clip": 0.01131188, + "auxiliary_loss_mlp": 0.01104201, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.00063217, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 1.7391383837562597, + "language_loss": 0.62983745, + "learning_rate": 1.275618614968721e-10, + "loss": 0.6521914, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.626481056213379 + }, + { + "auxiliary_loss_clip": 0.01115079, + "auxiliary_loss_mlp": 0.01104648, + "balance_loss_clip": 1.00188482, + "balance_loss_mlp": 1.00050676, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 5.840839203191363, + "language_loss": 0.76226223, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78445959, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 3.9398298263549805 + }, + { + "auxiliary_loss_clip": 0.0113351, + "auxiliary_loss_mlp": 0.0110258, + "balance_loss_clip": 1.00184619, + "balance_loss_mlp": 1.00044131, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 1.9338001463419368, + "language_loss": 0.70458812, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72694898, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.5702381134033203 + }, + { + "auxiliary_loss_clip": 0.01148375, + "auxiliary_loss_mlp": 0.01102395, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00035167, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.6021276773456419, + "language_loss": 0.71775877, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74026644, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 2.5392746925354004 + }, + { + "auxiliary_loss_clip": 0.01131505, + "auxiliary_loss_mlp": 0.01102816, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00048733, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 1.9324517002402086, + "language_loss": 0.78202873, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80437195, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 3.939577579498291 + }, + { + "auxiliary_loss_clip": 0.01100051, + "auxiliary_loss_mlp": 0.00747308, + "balance_loss_clip": 1.00162029, + "balance_loss_mlp": 1.00031412, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 4.547191059323187, + "language_loss": 0.76107121, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.77954477, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 2.6746270656585693 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.01103863, + "balance_loss_clip": 1.00201344, + "balance_loss_mlp": 1.0004847, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 1.8632859571839542, + "language_loss": 0.69025439, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.7126112, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 2.7412967681884766 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.01102249, + "balance_loss_clip": 1.00150537, + "balance_loss_mlp": 1.00039673, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.7775410083686063, + "language_loss": 0.79640257, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81842411, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 2.692991018295288 + }, + { + "auxiliary_loss_clip": 0.01131834, + "auxiliary_loss_mlp": 0.01101766, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00039077, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 2.1451780835919654, + "language_loss": 0.80175102, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82408702, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 2.6256356239318848 + }, + { + "auxiliary_loss_clip": 0.0114951, + "auxiliary_loss_mlp": 0.01102113, + "balance_loss_clip": 1.00176406, + "balance_loss_mlp": 1.00045109, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.6693886108718337, + "language_loss": 0.60739899, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62991524, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 2.543921947479248 + }, + { + "auxiliary_loss_clip": 0.01132515, + "auxiliary_loss_mlp": 0.01102082, + "balance_loss_clip": 1.00171673, + "balance_loss_mlp": 1.00042033, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.8549479958359754, + "language_loss": 0.77621031, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79855621, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.6152336597442627 + }, + { + "auxiliary_loss_clip": 0.01131383, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_clip": 1.00167453, + "balance_loss_mlp": 1.00045371, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.3659173498188542, + "language_loss": 0.69378853, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71612155, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 2.6045029163360596 + }, + { + "auxiliary_loss_clip": 0.01164911, + "auxiliary_loss_mlp": 0.01102814, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00038934, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.3671690308870637, + "language_loss": 0.81646788, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83914518, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 2.46929669380188 + }, + { + "auxiliary_loss_clip": 0.01133445, + "auxiliary_loss_mlp": 0.0110267, + "balance_loss_clip": 1.00172997, + "balance_loss_mlp": 1.00043654, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.5696599450323094, + "language_loss": 0.77869767, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80105877, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.5980894565582275 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.00045776, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 2.100387487468699, + "language_loss": 0.72709155, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74941725, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.579160690307617 + }, + { + "auxiliary_loss_clip": 0.01165054, + "auxiliary_loss_mlp": 0.01104238, + "balance_loss_clip": 1.00186193, + "balance_loss_mlp": 1.00047803, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.482506655113205, + "language_loss": 0.82280457, + "learning_rate": 7.011385585031781e-11, + "loss": 0.84549749, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.5225510597229004 + }, + { + "auxiliary_loss_clip": 0.01150289, + "auxiliary_loss_mlp": 0.01104191, + "balance_loss_clip": 1.00181711, + "balance_loss_mlp": 1.00043106, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 2.3982127290897655, + "language_loss": 0.70229089, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72483575, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.5297963619232178 + }, + { + "auxiliary_loss_clip": 0.01133997, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_clip": 1.00182474, + "balance_loss_mlp": 1.00056362, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.6260305970976798, + "language_loss": 0.63696647, + "learning_rate": 6.374335005676634e-11, + "loss": 0.6593411, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.6621766090393066 + }, + { + "auxiliary_loss_clip": 0.01131853, + "auxiliary_loss_mlp": 0.01102505, + "balance_loss_clip": 1.001647, + "balance_loss_mlp": 1.00046217, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 5.931388640155084, + "language_loss": 0.72776592, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75010955, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.69643235206604 + }, + { + "auxiliary_loss_clip": 0.01131362, + "auxiliary_loss_mlp": 0.01103207, + "balance_loss_clip": 1.00173116, + "balance_loss_mlp": 1.00049639, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.4536594080773402, + "language_loss": 0.85226178, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87460744, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.564527988433838 + }, + { + "auxiliary_loss_clip": 0.01164754, + "auxiliary_loss_mlp": 0.0074724, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00033712, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 3.159746763583719, + "language_loss": 0.69348955, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71260947, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 2.489037036895752 + }, + { + "auxiliary_loss_clip": 0.01100075, + "auxiliary_loss_mlp": 0.01103353, + "balance_loss_clip": 1.00169349, + "balance_loss_mlp": 1.00045133, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.174197342347467, + "language_loss": 0.7289542, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.75098848, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.648343563079834 + }, + { + "auxiliary_loss_clip": 0.01139524, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_clip": 1.00119436, + "balance_loss_mlp": 0.99995369, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.9066956922238122, + "language_loss": 0.60457468, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62676287, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 3.0098509788513184 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.01102957, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00043702, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 2.94096147812411, + "language_loss": 0.77547836, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79799187, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.5337798595428467 + }, + { + "auxiliary_loss_clip": 0.01150341, + "auxiliary_loss_mlp": 0.01103045, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00042927, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.9226429423195355, + "language_loss": 0.81835806, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84089196, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 4.050463438034058 + }, + { + "auxiliary_loss_clip": 0.01129291, + "auxiliary_loss_mlp": 0.01104592, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00064158, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.0625502875613932, + "language_loss": 0.64226794, + "learning_rate": 4.129484715709175e-11, + "loss": 0.66460675, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.5785746574401855 + }, + { + "auxiliary_loss_clip": 0.01127752, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_clip": 1.00120449, + "balance_loss_mlp": 1.00012243, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8517765801731427, + "language_loss": 0.62369388, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64576989, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 3.0517566204071045 + }, + { + "auxiliary_loss_clip": 0.01131509, + "auxiliary_loss_mlp": 0.01101627, + "balance_loss_clip": 1.00170279, + "balance_loss_mlp": 1.00044227, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.413270295538381, + "language_loss": 0.78668487, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80901629, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.56449556350708 + }, + { + "auxiliary_loss_clip": 0.01117198, + "auxiliary_loss_mlp": 0.01103221, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00051057, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 1.9912249362005907, + "language_loss": 0.82444799, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84665215, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 2.6212775707244873 + }, + { + "auxiliary_loss_clip": 0.01115232, + "auxiliary_loss_mlp": 0.01104195, + "balance_loss_clip": 1.00192153, + "balance_loss_mlp": 1.00053024, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.171096789832959, + "language_loss": 0.62483537, + "learning_rate": 3.189071962883538e-11, + "loss": 0.6470297, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.6377246379852295 + }, + { + "auxiliary_loss_clip": 0.01133707, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_clip": 1.00188839, + "balance_loss_mlp": 1.00050378, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 2.1091226075659546, + "language_loss": 0.7102043, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73258114, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 2.5946295261383057 + }, + { + "auxiliary_loss_clip": 0.01164942, + "auxiliary_loss_mlp": 0.01102698, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00036907, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.6078913107839028, + "language_loss": 0.64434153, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66701794, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 2.4939382076263428 + }, + { + "auxiliary_loss_clip": 0.01116303, + "auxiliary_loss_mlp": 0.01101403, + "balance_loss_clip": 1.00169933, + "balance_loss_mlp": 1.00040936, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.8381325891768328, + "language_loss": 0.71207643, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73425347, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 2.5754647254943848 + }, + { + "auxiliary_loss_clip": 0.01148251, + "auxiliary_loss_mlp": 0.00747395, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00039911, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.9067502582532825, + "language_loss": 0.81662852, + "learning_rate": 2.370001590090709e-11, + "loss": 0.835585, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.549729824066162 + }, + { + "auxiliary_loss_clip": 0.01117061, + "auxiliary_loss_mlp": 0.01102808, + "balance_loss_clip": 1.00159013, + "balance_loss_mlp": 1.0003829, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.9217832242622817, + "language_loss": 0.66793251, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69013119, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.709202766418457 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.01102933, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00041318, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 1.9160548700832667, + "language_loss": 0.80736017, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82938492, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 4.014681339263916 + }, + { + "auxiliary_loss_clip": 0.0113314, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.00173008, + "balance_loss_mlp": 1.00057459, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 6.9245133109399175, + "language_loss": 0.62860316, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.65094543, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.578598976135254 + }, + { + "auxiliary_loss_clip": 0.01145316, + "auxiliary_loss_mlp": 0.01102456, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00050831, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.176512411791848, + "language_loss": 0.67701685, + "learning_rate": 1.672274094288717e-11, + "loss": 0.6994946, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 2.5350801944732666 + }, + { + "auxiliary_loss_clip": 0.01117492, + "auxiliary_loss_mlp": 0.01103716, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00052822, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.4056870798399668, + "language_loss": 0.69750959, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71972167, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.7023708820343018 + }, + { + "auxiliary_loss_clip": 0.01131542, + "auxiliary_loss_mlp": 0.01102019, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00045276, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.3841800093430006, + "language_loss": 0.73792893, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76026458, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 3.995666027069092 + }, + { + "auxiliary_loss_clip": 0.01116467, + "auxiliary_loss_mlp": 0.00747413, + "balance_loss_clip": 1.00163746, + "balance_loss_mlp": 1.00039601, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 1.9203217552055307, + "language_loss": 0.73185873, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75049758, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.57702374458313 + }, + { + "auxiliary_loss_clip": 0.01150288, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00043166, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1.6793048489441367, + "language_loss": 0.72435957, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.74688911, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 2.5454623699188232 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.0074734, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00043726, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 1.9719198114156113, + "language_loss": 0.77238655, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79151148, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 3.83538818359375 + }, + { + "auxiliary_loss_clip": 0.01131306, + "auxiliary_loss_mlp": 0.01102921, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00040126, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 1.856669440085281, + "language_loss": 0.82990563, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85224789, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.531334638595581 + }, + { + "auxiliary_loss_clip": 0.01133308, + "auxiliary_loss_mlp": 0.01101424, + "balance_loss_clip": 1.00163758, + "balance_loss_mlp": 1.00023878, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.5245768687503387, + "language_loss": 0.78771091, + "learning_rate": 7.43233506206309e-12, + "loss": 0.81005818, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 2.6016485691070557 + }, + { + "auxiliary_loss_clip": 0.01164876, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_clip": 1.00180733, + "balance_loss_mlp": 1.00044489, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.713486712974835, + "language_loss": 0.74739361, + "learning_rate": 6.408493534060255e-12, + "loss": 0.77006626, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 2.5037682056427 + }, + { + "auxiliary_loss_clip": 0.01150039, + "auxiliary_loss_mlp": 0.01101424, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.0003345, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 1.8721425875413813, + "language_loss": 0.86923742, + "learning_rate": 5.460491963260594e-12, + "loss": 0.891752, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 2.524021625518799 + }, + { + "auxiliary_loss_clip": 0.01116285, + "auxiliary_loss_mlp": 0.01101892, + "balance_loss_clip": 1.00164676, + "balance_loss_mlp": 1.00037336, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.771416556157077, + "language_loss": 0.7232998, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74548161, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 2.640899419784546 + }, + { + "auxiliary_loss_clip": 0.01110461, + "auxiliary_loss_mlp": 0.0107898, + "balance_loss_clip": 1.00106871, + "balance_loss_mlp": 1.00001574, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7599930973823087, + "language_loss": 0.56568694, + "learning_rate": 3.79200883515729e-12, + "loss": 0.5875814, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 3.4244065284729004 + }, + { + "auxiliary_loss_clip": 0.01118653, + "auxiliary_loss_mlp": 0.01102978, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00036216, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 2.627760646336216, + "language_loss": 0.70909417, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73131049, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 2.6014981269836426 + }, + { + "auxiliary_loss_clip": 0.01114366, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.0019685, + "balance_loss_mlp": 1.00044072, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 1.7872928836828001, + "language_loss": 0.74752039, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76969844, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 2.5850048065185547 + }, + { + "auxiliary_loss_clip": 0.01116392, + "auxiliary_loss_mlp": 0.01103089, + "balance_loss_clip": 1.00168133, + "balance_loss_mlp": 1.00037825, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.5495793879042272, + "language_loss": 0.73952776, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.76172262, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.6727147102355957 + }, + { + "auxiliary_loss_clip": 0.01148016, + "auxiliary_loss_mlp": 0.01102176, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.0004189, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 3.397004690682507, + "language_loss": 0.77108407, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.79358602, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 2.5367374420166016 + }, + { + "auxiliary_loss_clip": 0.01164866, + "auxiliary_loss_mlp": 0.01103538, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00054169, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.9081924129704126, + "language_loss": 0.82057542, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84325945, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.6261045932769775 + }, + { + "auxiliary_loss_clip": 0.01145674, + "auxiliary_loss_mlp": 0.01103263, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00045717, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.1916521923194083, + "language_loss": 0.71048284, + "learning_rate": 6.067215747584952e-13, + "loss": 0.7329722, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.5997724533081055 + }, + { + "auxiliary_loss_clip": 0.01149636, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00170028, + "balance_loss_mlp": 1.00043106, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.4193822242218472, + "language_loss": 0.75430948, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77682769, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.538975477218628 + }, + { + "auxiliary_loss_clip": 0.01131692, + "auxiliary_loss_mlp": 0.01104004, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00043464, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.7019973400070605, + "language_loss": 0.60396278, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62631977, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.5542287826538086 + }, + { + "auxiliary_loss_clip": 0.01102295, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_clip": 1.00172281, + "balance_loss_mlp": 1.00041473, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 4.316822601191206, + "language_loss": 0.60020518, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62225652, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.6390466690063477 + }, + { + "auxiliary_loss_clip": 0.011001, + "auxiliary_loss_mlp": 0.00747394, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00042605, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.7306806586106624, + "language_loss": 0.72814345, + "learning_rate": 0.0, + "loss": 0.74661845, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.593424081802368 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992169073237033e+18, + "train_loss": 0.7704915717412584, + "train_runtime": 47602.0927, + "train_samples_per_second": 13.976, + "train_steps_per_second": 0.349 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/training_args.bin b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3149ecb06ac65a3aea922af640ea3fc44466226a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dca0d87fd4cf21dba2781d9ed4ca6c420f1f15440dc50ff1c08e99716f599d4 +size 7992